1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2022 Garrett D'Amore
27 * Copyright 2024 Oxide Computer Company
28 */
29
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/buf.h>
35 #include <sys/conf.h>
36 #include <sys/cred.h>
37 #include <sys/kmem.h>
38 #include <sys/kmem_impl.h>
39 #include <sys/sysmacros.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/debug.h>
43 #include <sys/errno.h>
44 #include <sys/time.h>
45 #include <sys/file.h>
46 #include <sys/open.h>
47 #include <sys/user.h>
48 #include <sys/termios.h>
49 #include <sys/stream.h>
50 #include <sys/strsubr.h>
51 #include <sys/strsun.h>
52 #include <sys/suntpi.h>
53 #include <sys/ddi.h>
54 #include <sys/esunddi.h>
55 #include <sys/flock.h>
56 #include <sys/modctl.h>
57 #include <sys/vtrace.h>
58 #include <sys/cmn_err.h>
59 #include <sys/pathname.h>
60
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sockio.h>
64 #include <netinet/in.h>
65 #include <sys/un.h>
66 #include <sys/strsun.h>
67
68 #include <sys/tiuser.h>
69 #define _SUN_TPI_VERSION 2
70 #include <sys/tihdr.h>
71 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
72
73 #include <c2/audit.h>
74
75 #include <inet/common.h>
76 #include <inet/ip.h>
77 #include <inet/ip6.h>
78 #include <inet/tcp.h>
79 #include <inet/udp_impl.h>
80
81 #include <sys/zone.h>
82
83 #include <fs/sockfs/sockcommon.h>
84 #include <fs/sockfs/socktpi.h>
85 #include <fs/sockfs/socktpi_impl.h>
86
87 /*
88 * Possible failures when memory can't be allocated. The documented behavior:
89 *
 *              5.5:                4.X:           XNET:
 * accept:      ENOMEM/ENOSR/EINTR  - (EINTR)      ENOMEM/ENOBUFS/ENOSR/
 *                                                 EINTR
 *      (4.X does not document EINTR but returns it)
 * bind:        ENOSR               -              ENOBUFS/ENOSR
 * connect:     EINTR               EINTR          ENOBUFS/ENOSR/EINTR
 * getpeername: ENOMEM/ENOSR        ENOBUFS (-)    ENOBUFS/ENOSR
 * getsockname: ENOMEM/ENOSR        ENOBUFS (-)    ENOBUFS/ENOSR
 *      (4.X getpeername and getsockname do not fail in practice)
 * getsockopt:  ENOMEM/ENOSR        -              ENOBUFS/ENOSR
 * listen:      -                   -              ENOBUFS
 * recv:        ENOMEM/ENOSR/EINTR  EINTR          ENOBUFS/ENOMEM/ENOSR/
 *                                                 EINTR
 * send:        ENOMEM/ENOSR/EINTR  ENOBUFS/EINTR  ENOBUFS/ENOMEM/ENOSR/
 *                                                 EINTR
 * setsockopt:  ENOMEM/ENOSR        -              ENOBUFS/ENOMEM/ENOSR
 * shutdown:    ENOMEM/ENOSR        -              ENOBUFS/ENOSR
 * socket:      ENOMEM/ENOSR        ENOBUFS        ENOBUFS/ENOMEM/ENOSR
 * socketpair:  ENOMEM/ENOSR        -              ENOBUFS/ENOMEM/ENOSR
109 *
110 * Resolution. When allocation fails:
111 * recv: return EINTR
112 * send: return EINTR
113 * connect, accept: EINTR
114 * bind, listen, shutdown (unbind, unix_close, disconnect): sleep
115 * socket, socketpair: ENOBUFS
116 * getpeername, getsockname: sleep
117 * getsockopt, setsockopt: sleep
118 */
119
120 #ifdef SOCK_TEST
121 /*
122 * Variables that make sockfs do something other than the standard TPI
123 * for the AF_INET transports.
124 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
130 *
131 * soconnect_tpi_udp:
132 * UDP and ICMP can handle a T_CONN_REQ.
133 * This is needed to make the sequence of connect(), getsockname()
134 * return the local IP address used to send packets to the connected to
135 * destination.
136 *
137 * soconnect_tpi_tcp:
138 * TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
139 * Set this to non-zero to send TPI conformant messages to TCP in this
140 * respect. This is a performance optimization.
141 *
142 * soaccept_tpi_tcp:
143 * TCP can handle a T_CONN_REQ without the acceptor being bound.
144 * This is a performance optimization that has been picked up in XTI.
145 *
146 * soaccept_tpi_multioptions:
147 * When inheriting SOL_SOCKET options from the listener to the accepting
148 * socket send them as a single message for AF_INET{,6}.
149 */
150 int solisten_tpi_tcp = 0;
151 int soconnect_tpi_udp = 0;
152 int soconnect_tpi_tcp = 0;
153 int soaccept_tpi_tcp = 0;
154 int soaccept_tpi_multioptions = 1;
155 #else /* SOCK_TEST */
156 #define soconnect_tpi_tcp 0
157 #define soconnect_tpi_udp 0
158 #define solisten_tpi_tcp 0
159 #define soaccept_tpi_tcp 0
160 #define soaccept_tpi_multioptions 1
161 #endif /* SOCK_TEST */
162
163 #ifdef SOCK_TEST
164 extern int do_useracc;
165 extern clock_t sock_test_timelimit;
166 #endif /* SOCK_TEST */
167
168 extern uint32_t ucredsize;
169
/*
 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
 * applications working. Turn on this flag to disable these checks.
 */
int xnet_skip_checks = 0;
/*
 * When non-zero, a message is printed each time one of the X/Open checks
 * rejects a call (for example the bind state check, which reports
 * "X/Open bind state check caused EINVAL").
 */
int xnet_check_print = 0;
/*
 * NOTE(review): not referenced in the visible part of this file; presumably
 * enables a diagnostic when data is truncated -- confirm against its users.
 */
int xnet_truncate_print = 0;
177
178 static void sotpi_destroy(struct sonode *);
179 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
180 int, int *, cred_t *cr);
181
182 static boolean_t sotpi_info_create(struct sonode *, int);
183 static void sotpi_info_init(struct sonode *);
184 static void sotpi_info_fini(struct sonode *);
185 static void sotpi_info_destroy(struct sonode *);
186
187 /*
188 * Do direct function call to the transport layer below; this would
189 * also allow the transport to utilize read-side synchronous stream
190 * interface if necessary. This is a /etc/system tunable that must
191 * not be modified on a running system. By default this is enabled
192 * for performance reasons and may be disabled for debugging purposes.
193 */
194 boolean_t socktpi_direct = B_TRUE;
195
196 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197
198 extern void sigintr(k_sigset_t *, int);
199 extern void sigunintr(k_sigset_t *);
200
201 static int sotpi_unbind(struct sonode *, int);
202
203 /* TPI sockfs sonode operations */
204 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
205 int);
206 static int sotpi_accept(struct sonode *, int, struct cred *,
207 struct sonode **);
208 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
209 int, struct cred *);
210 static int sotpi_listen(struct sonode *, int, struct cred *);
211 static int sotpi_connect(struct sonode *, struct sockaddr *,
212 socklen_t, int, int, struct cred *);
213 extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
214 struct uio *, struct cred *);
215 static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
216 struct uio *, struct cred *);
217 static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
218 struct cred *, mblk_t **);
219 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
220 struct uio *, void *, t_uscalar_t, int);
221 static int sodgram_direct(struct sonode *, struct sockaddr *,
222 socklen_t, struct uio *, int);
223 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
224 socklen_t *, boolean_t, struct cred *);
225 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
226 socklen_t *, struct cred *);
227 static int sotpi_shutdown(struct sonode *, int, struct cred *);
228 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
229 socklen_t *, int, struct cred *);
230 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
231 socklen_t, struct cred *);
232 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
233 int32_t *);
234 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
235 struct cred *, int32_t *);
236 static int sotpi_poll(struct sonode *, short, int, short *,
237 struct pollhead **);
238 static int sotpi_close(struct sonode *, int, struct cred *);
239
240 static int i_sotpi_info_constructor(sotpi_info_t *);
241 static void i_sotpi_info_destructor(sotpi_info_t *);
242
/*
 * Operations vector for TPI sockets.  sotpi_create() hands this table to
 * sonode_init() for every sonode it allocates, and sotpi_destroy() asserts
 * that so_ops still points here before tearing the sonode down.  The entries
 * are positional; the trailing comment on each line names the sonodeops_t
 * slot it fills.
 */
sonodeops_t sotpi_sonodeops = {
	sotpi_init,		/* sop_init		*/
	sotpi_accept,		/* sop_accept		*/
	sotpi_bind,		/* sop_bind		*/
	sotpi_listen,		/* sop_listen		*/
	sotpi_connect,		/* sop_connect		*/
	sotpi_recvmsg,		/* sop_recvmsg		*/
	sotpi_sendmsg,		/* sop_sendmsg		*/
	sotpi_sendmblk,		/* sop_sendmblk		*/
	sotpi_getpeername,	/* sop_getpeername	*/
	sotpi_getsockname,	/* sop_getsockname	*/
	sotpi_shutdown,		/* sop_shutdown		*/
	sotpi_getsockopt,	/* sop_getsockopt	*/
	sotpi_setsockopt,	/* sop_setsockopt	*/
	sotpi_ioctl,		/* sop_ioctl		*/
	sotpi_poll,		/* sop_poll		*/
	sotpi_close,		/* sop_close		*/
};
261
262 /*
263 * Return a TPI socket vnode.
264 *
265 * Note that sockets assume that the driver will clone (either itself
266 * or by using the clone driver) i.e. a socket() call will always
267 * result in a new vnode being created.
268 */
269
/*
 * Common create code for socket and accept. If tso is set the values
 * from that node are used instead of issuing a T_INFO_REQ.
 */
274
275 /* ARGSUSED */
276 static struct sonode *
sotpi_create(struct sockparams * sp,int family,int type,int protocol,int version,int sflags,int * errorp,cred_t * cr)277 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
278 int version, int sflags, int *errorp, cred_t *cr)
279 {
280 struct sonode *so;
281 kmem_cache_t *cp;
282
283 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
284
285 /*
286 * to be compatible with old tpi socket implementation ignore
287 * sleep flag (sflags) passed in
288 */
289 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
290 so = kmem_cache_alloc(cp, KM_SLEEP);
291 if (so == NULL) {
292 *errorp = ENOMEM;
293 return (NULL);
294 }
295
296 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
297 sotpi_info_init(so);
298
299 if (version == SOV_DEFAULT)
300 version = so_default_version;
301
302 so->so_version = (short)version;
303 *errorp = 0;
304
305 return (so);
306 }
307
308 static void
sotpi_destroy(struct sonode * so)309 sotpi_destroy(struct sonode *so)
310 {
311 kmem_cache_t *cp;
312 struct sockparams *origsp;
313
314 /*
315 * If there is a new dealloc function (ie. smod_destroy_func),
316 * then it should check the correctness of the ops.
317 */
318
319 ASSERT(so->so_ops == &sotpi_sonodeops);
320
321 origsp = SOTOTPI(so)->sti_orig_sp;
322
323 sotpi_info_fini(so);
324
325 if (so->so_state & SS_FALLBACK_COMP) {
326 /*
327 * A fallback happend, which means that a sotpi_info_t struct
328 * was allocated (as opposed to being allocated from the TPI
329 * sonode cache. Therefore we explicitly free the struct
330 * here.
331 */
332 sotpi_info_destroy(so);
333 ASSERT(origsp != NULL);
334
335 origsp->sp_smod_info->smod_sock_destroy_func(so);
336 SOCKPARAMS_DEC_REF(origsp);
337 } else {
338 sonode_fini(so);
339 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
340 socktpi_cache;
341 kmem_cache_free(cp, so);
342 }
343 }
344
/*
 * sop_init entry point for TPI sockets: open the stream underneath the
 * socket's vnode and finish socket-level set up.
 *
 *   so		sonode being initialised.
 *   tso	when non-NULL, the sonode whose settings are inherited; the
 *		code below treats it as the listener of an acceptor and only
 *		allows it for SOCK_STREAM.
 *   cr		credentials passed to stropen(), strioctl(), setsockopt and
 *		close.
 *   flags	open flags.  FREAD|FWRITE are always OR-ed in; SO_SOCKSTR and
 *		SO_ACCEPTOR may be added here, and SO_FALLBACK is honoured if
 *		the caller set it.
 *
 * Returns 0 on success, otherwise an errno value:
 *   - the stropen() error if the open fails;
 *   - ENOTTY if the opened stream is flagged STRISTTY;
 *   - the strioctl(_SIOCSOCKFALLBACK) or so_strinit() error;
 *   - EPROTONOSUPPORT if the SO_PROTOTYPE setsockopt fails.
 * Every failure after a successful stropen() closes the socket again with
 * sotpi_close() before returning.  Note that the early-return paths skip the
 * TRACE_4 at the bottom of the function.
 */
/* ARGSUSED1 */
int
sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
{
	major_t maj;
	dev_t newdev;
	struct vnode *vp;
	int error = 0;
	struct stdata *stp;

	sotpi_info_t *sti = SOTOTPI(so);

	dprint(1, ("sotpi_init()\n"));

	/*
	 * over write the sleep flag passed in but that is ok
	 * as tpi socket does not honor sleep flag.
	 */
	flags |= FREAD|FWRITE;

	/*
	 * Record in so_flag that it is a clone.
	 */
	if (getmajor(sti->sti_dev) == clone_major)
		so->so_flag |= SOCLONE;

	/*
	 * Stream/datagram sockets over AF_INET{,6} using TCP, UDP or the
	 * IPPROTO_IP wildcard are candidates for the direct-call interface.
	 */
	if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
	    (so->so_family == AF_INET || so->so_family == AF_INET6) &&
	    (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
	    so->so_protocol == IPPROTO_IP)) {
		/* Tell tcp or udp that it's talking to sockets */
		flags |= SO_SOCKSTR;

		/*
		 * Here we indicate to socktpi_open() our attempt to
		 * make direct calls between sockfs and transport.
		 * The final decision is left to socktpi_open().
		 */
		sti->sti_direct = 1;

		/* Only stream sockets may be initialised from a listener. */
		ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
		if (so->so_type == SOCK_STREAM && tso != NULL) {
			if (SOTOTPI(tso)->sti_direct) {
				/*
				 * Inherit sti_direct from listener and pass
				 * SO_ACCEPTOR open flag to tcp, indicating
				 * that this is an accept fast-path instance.
				 */
				flags |= SO_ACCEPTOR;
			} else {
				/*
				 * sti_direct is not set on listener, meaning
				 * that the listener has been converted from
				 * a socket to a stream.  Ensure that the
				 * acceptor inherits these settings.
				 */
				sti->sti_direct = 0;
				flags &= ~SO_SOCKSTR;
			}
		}
	}

	/*
	 * Tell local transport that it is talking to sockets.
	 */
	if (so->so_family == AF_UNIX) {
		flags |= SO_SOCKSTR;
	}

	vp = SOTOV(so);
	newdev = vp->v_rdev;
	maj = getmajor(newdev);
	ASSERT(STREAMSTAB(maj));

	/* stropen() may update newdev; see the SOCLONE assertion below. */
	error = stropen(vp, &newdev, flags, cr);

	/* Sampled after the open; may be NULL when the open failed. */
	stp = vp->v_stream;
	if (error == 0) {
		/* A clone open must have produced a different device. */
		if (so->so_flag & SOCLONE)
			ASSERT(newdev != vp->v_rdev);
		/* Publish the (possibly new) device number under so_lock. */
		mutex_enter(&so->so_lock);
		sti->sti_dev = newdev;
		vp->v_rdev = newdev;
		mutex_exit(&so->so_lock);

		if (stp->sd_flag & STRISTTY) {
			/*
			 * this is a post SVR4 tty driver - a socket can not
			 * be a controlling terminal. Fail the open.
			 */
			(void) sotpi_close(so, flags, cr);
			return (ENOTTY);	/* XXX */
		}

		ASSERT(stp->sd_wrq != NULL);
		sti->sti_provinfo = tpi_findprov(stp->sd_wrq);

		/*
		 * If caller is interested in doing direct function call
		 * interface to/from transport module, probe the module
		 * directly beneath the streamhead to see if it qualifies.
		 *
		 * We turn off the direct interface when qualifications fail.
		 * In the acceptor case, we simply turn off the sti_direct
		 * flag on the socket. We do the fallback after the accept
		 * has completed, before the new socket is returned to the
		 * application.
		 */
		if (sti->sti_direct) {
			/* Write queue of the module right below the head. */
			queue_t *tq = stp->sd_wrq->q_next;

			/*
			 * sti_direct is currently supported and tested
			 * only for tcp/udp; this is the main reason to
			 * have the following assertions.
			 */
			ASSERT(so->so_family == AF_INET ||
			    so->so_family == AF_INET6);
			ASSERT(so->so_protocol == IPPROTO_UDP ||
			    so->so_protocol == IPPROTO_TCP ||
			    so->so_protocol == IPPROTO_IP);
			ASSERT(so->so_type == SOCK_DGRAM ||
			    so->so_type == SOCK_STREAM);

			/*
			 * Abort direct call interface if the module directly
			 * underneath the stream head is not defined with the
			 * _D_DIRECT flag.  This could happen in the tcp or
			 * udp case, when some other module is autopushed
			 * above it, or for some reasons the expected module
			 * isn't purely D_MP (which is the main requirement).
			 */
			if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
			    !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
				int rval;

				/* Continue on without direct calls */
				sti->sti_direct = 0;

				/*
				 * Cannot issue ioctl on fallback socket since
				 * there is no conn associated with the queue.
				 * The fallback downcall will notify the proto
				 * of the change.
				 */
				if (!(flags & SO_ACCEPTOR) &&
				    !(flags & SO_FALLBACK)) {
					if ((error = strioctl(vp,
					    _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
					    cr, &rval)) != 0) {
						(void) sotpi_close(so, flags,
						    cr);
						return (error);
					}
				}
			}
		}

		if (flags & SO_FALLBACK) {
			/*
			 * The stream created does not have a conn.
			 * do stream set up after conn has been assigned
			 */
			return (error);
		}
		/* Socket-level stream set up; tso supplies inherited state. */
		error = so_strinit(so, tso);
		if (error != 0) {
			(void) sotpi_close(so, flags, cr);
			return (error);
		}

		/* Enable sendfile() on AF_UNIX streams */
		if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
			mutex_enter(&so->so_lock);
			so->so_mode |= SM_SENDFILESUPP;
			mutex_exit(&so->so_lock);
		}

		/*
		 * Wildcard: the requested protocol differs from the one the
		 * sockparams entry was registered with, so tell the transport
		 * which protocol is actually wanted.
		 */
		if (so->so_protocol != so->so_sockparams->sp_protocol) {
			int protocol = so->so_protocol;
			/*
			 * Issue SO_PROTOTYPE setsockopt.
			 */
			error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
			if (error != 0) {
				(void) sotpi_close(so, flags, cr);
				/*
				 * Setsockopt often fails with ENOPROTOOPT but
				 * socket() should fail with
				 * EPROTONOSUPPORT/EPROTOTYPE.
				 */
				return (EPROTONOSUPPORT);
			}
		}

	} else {
		/*
		 * While the same socket can not be reopened (unlike specfs)
		 * the stream head sets STREOPENFAIL when the autopush fails.
		 */
		if ((stp != NULL) &&
		    (stp->sd_flag & STREOPENFAIL)) {
			/*
			 * Open failed part way through.
			 */
			mutex_enter(&stp->sd_lock);
			stp->sd_flag &= ~STREOPENFAIL;
			mutex_exit(&stp->sd_lock);
			(void) sotpi_close(so, flags, cr);
			return (error);
			/*NOTREACHED*/
		}
		/* Any other open failure is expected to leave no stream. */
		ASSERT(stp == NULL);
	}
	TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
	    "sockfs open:maj %d vp %p so %p error %d",
	    maj, vp, so, error);
	return (error);
}
566
567 /*
568 * Bind the socket to an unspecified address in sockfs only.
569 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
570 * required in all cases.
571 */
572 static void
so_automatic_bind(struct sonode * so)573 so_automatic_bind(struct sonode *so)
574 {
575 sotpi_info_t *sti = SOTOTPI(so);
576 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
577
578 ASSERT(MUTEX_HELD(&so->so_lock));
579 ASSERT(!(so->so_state & SS_ISBOUND));
580 ASSERT(sti->sti_unbind_mp);
581
582 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
583 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
584 sti->sti_laddr_sa->sa_family = so->so_family;
585 so->so_state |= SS_ISBOUND;
586 }
587
588
589 /*
590 * bind the socket.
591 *
592 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
593 * are passed in we allow rebinding. Note that for backwards compatibility
594 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
595 * Thus the rebinding code is currently not executed.
596 *
597 * The constraints for rebinding are:
598 * - it is a SOCK_DGRAM, or
599 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
600 * and no listen() has been done.
 * This rebinding code was added based on some language in the XNET book
 * about not returning EINVAL if the protocol allows rebinding. However,
 * this language is not present in the POSIX socket draft. Thus maybe the
 * rebinding logic should be deleted from the source.
605 *
606 * A null "name" can be used to unbind the socket if:
607 * - it is a SOCK_DGRAM, or
608 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
609 * and no listen() has been done.
610 */
611 /* ARGSUSED */
612 static int
sotpi_bindlisten(struct sonode * so,struct sockaddr * name,socklen_t namelen,int backlog,int flags,struct cred * cr)613 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
614 socklen_t namelen, int backlog, int flags, struct cred *cr)
615 {
616 struct T_bind_req bind_req;
617 struct T_bind_ack *bind_ack;
618 int error = 0;
619 mblk_t *mp;
620 void *addr;
621 t_uscalar_t addrlen;
622 int unbind_on_err = 1;
623 boolean_t clear_acceptconn_on_err = B_FALSE;
624 boolean_t restore_backlog_on_err = B_FALSE;
625 int save_so_backlog = 0;
626 t_scalar_t PRIM_type = O_T_BIND_REQ;
627 boolean_t tcp_udp_xport;
628 sotpi_info_t *sti = SOTOTPI(so);
629
630 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
631 (void *)so, (void *)name, namelen, backlog, flags,
632 pr_state(so->so_state, so->so_mode)));
633
634 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
635
636 if (!(flags & _SOBIND_LOCK_HELD)) {
637 mutex_enter(&so->so_lock);
638 so_lock_single(so); /* Set SOLOCKED */
639 } else {
640 ASSERT(MUTEX_HELD(&so->so_lock));
641 ASSERT(so->so_flag & SOLOCKED);
642 }
643
644 /*
645 * Make sure that there is a preallocated unbind_req message
646 * before binding. This message allocated when the socket is
647 * created but it might be have been consumed.
648 */
649 if (sti->sti_unbind_mp == NULL) {
650 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
651 /* NOTE: holding so_lock while sleeping */
652 sti->sti_unbind_mp =
653 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
654 cr);
655 }
656
657 if (flags & _SOBIND_REBIND) {
658 /*
659 * Called from solisten after doing an sotpi_unbind() or
660 * potentially without the unbind (latter for AF_INET{,6}).
661 */
662 ASSERT(name == NULL && namelen == 0);
663
664 if (so->so_family == AF_UNIX) {
665 ASSERT(sti->sti_ux_bound_vp);
666 addr = &sti->sti_ux_laddr;
667 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
668 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
669 "addr 0x%p, vp %p\n",
670 addrlen,
671 (void *)((struct so_ux_addr *)addr)->soua_vp,
672 (void *)sti->sti_ux_bound_vp));
673 } else {
674 addr = sti->sti_laddr_sa;
675 addrlen = (t_uscalar_t)sti->sti_laddr_len;
676 }
677 } else if (flags & _SOBIND_UNSPEC) {
678 ASSERT(name == NULL && namelen == 0);
679
680 /*
681 * The caller checked SS_ISBOUND but not necessarily
682 * under so_lock
683 */
684 if (so->so_state & SS_ISBOUND) {
685 /* No error */
686 goto done;
687 }
688
689 /* Set an initial local address */
690 switch (so->so_family) {
691 case AF_UNIX:
692 /*
693 * Use an address with same size as struct sockaddr
694 * just like BSD.
695 */
696 sti->sti_laddr_len =
697 (socklen_t)sizeof (struct sockaddr);
698 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
699 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
700 sti->sti_laddr_sa->sa_family = so->so_family;
701
702 /*
703 * Pass down an address with the implicit bind
704 * magic number and the rest all zeros.
705 * The transport will return a unique address.
706 */
707 sti->sti_ux_laddr.soua_vp = NULL;
708 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
709 addr = &sti->sti_ux_laddr;
710 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
711 break;
712
713 case AF_INET:
714 case AF_INET6:
715 /*
716 * An unspecified bind in TPI has a NULL address.
717 * Set the address in sockfs to have the sa_family.
718 */
719 sti->sti_laddr_len = (so->so_family == AF_INET) ?
720 (socklen_t)sizeof (sin_t) :
721 (socklen_t)sizeof (sin6_t);
722 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
723 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
724 sti->sti_laddr_sa->sa_family = so->so_family;
725 addr = NULL;
726 addrlen = 0;
727 break;
728
729 default:
730 /*
731 * An unspecified bind in TPI has a NULL address.
732 * Set the address in sockfs to be zero length.
733 *
734 * Can not assume there is a sa_family for all
735 * protocol families. For example, AF_X25 does not
736 * have a family field.
737 */
738 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
739 sti->sti_laddr_len = 0; /* XXX correct? */
740 addr = NULL;
741 addrlen = 0;
742 break;
743 }
744
745 } else {
746 if (so->so_state & SS_ISBOUND) {
747 /*
748 * If it is ok to rebind the socket, first unbind
749 * with the transport. A rebind to the NULL address
750 * is interpreted as an unbind.
751 * Note that a bind to NULL in BSD does unbind the
752 * socket but it fails with EINVAL.
753 * Note that regular sockets set SOV_SOCKBSD i.e.
754 * _SOBIND_SOCKBSD gets set here hence no type of
755 * socket does currently allow rebinding.
756 *
757 * If the name is NULL just do an unbind.
758 */
759 if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
760 name != NULL) {
761 error = EINVAL;
762 unbind_on_err = 0;
763 eprintsoline(so, error);
764 goto done;
765 }
766 if ((so->so_mode & SM_CONNREQUIRED) &&
767 (so->so_state & SS_CANTREBIND)) {
768 error = EINVAL;
769 unbind_on_err = 0;
770 eprintsoline(so, error);
771 goto done;
772 }
773 error = sotpi_unbind(so, 0);
774 if (error) {
775 eprintsoline(so, error);
776 goto done;
777 }
778 ASSERT(!(so->so_state & SS_ISBOUND));
779 if (name == NULL) {
780 so->so_state &=
781 ~(SS_ISCONNECTED|SS_ISCONNECTING);
782 goto done;
783 }
784 }
785
786 /* X/Open requires this check */
787 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
788 if (xnet_check_print) {
789 printf("sockfs: X/Open bind state check "
790 "caused EINVAL\n");
791 }
792 error = EINVAL;
793 goto done;
794 }
795
796 switch (so->so_family) {
797 case AF_UNIX:
798 /*
799 * All AF_UNIX addresses are nul terminated
800 * when copied (copyin_name) in so the minimum
801 * length is 3 bytes.
802 */
803 if (name == NULL ||
804 (ssize_t)namelen <= sizeof (short) + 1) {
805 error = EISDIR;
806 eprintsoline(so, error);
807 goto done;
808 }
809 /*
810 * Verify so_family matches the bound family.
811 * BSD does not check this for AF_UNIX resulting
812 * in funny mknods.
813 */
814 if (name->sa_family != so->so_family) {
815 error = EAFNOSUPPORT;
816 goto done;
817 }
818 break;
819 case AF_INET:
820 if (name == NULL) {
821 error = EINVAL;
822 eprintsoline(so, error);
823 goto done;
824 }
825 if ((size_t)namelen != sizeof (sin_t)) {
826 error = name->sa_family != so->so_family ?
827 EAFNOSUPPORT : EINVAL;
828 eprintsoline(so, error);
829 goto done;
830 }
831 if ((flags & _SOBIND_XPG4_2) &&
832 (name->sa_family != so->so_family)) {
833 /*
834 * This check has to be made for X/Open
835 * sockets however application failures have
836 * been observed when it is applied to
837 * all sockets.
838 */
839 error = EAFNOSUPPORT;
840 eprintsoline(so, error);
841 goto done;
842 }
843 /*
844 * Force a zero sa_family to match so_family.
845 *
846 * Some programs like inetd(8) don't set the
847 * family field. Other programs leave
848 * sin_family set to garbage - SunOS 4.X does
849 * not check the family field on a bind.
850 * We use the family field that
851 * was passed in to the socket() call.
852 */
853 name->sa_family = so->so_family;
854 break;
855
856 case AF_INET6: {
857 #ifdef DEBUG
858 sin6_t *sin6 = (sin6_t *)name;
859 #endif /* DEBUG */
860
861 if (name == NULL) {
862 error = EINVAL;
863 eprintsoline(so, error);
864 goto done;
865 }
866 if ((size_t)namelen != sizeof (sin6_t)) {
867 error = name->sa_family != so->so_family ?
868 EAFNOSUPPORT : EINVAL;
869 eprintsoline(so, error);
870 goto done;
871 }
872 if (name->sa_family != so->so_family) {
873 /*
874 * With IPv6 we require the family to match
875 * unlike in IPv4.
876 */
877 error = EAFNOSUPPORT;
878 eprintsoline(so, error);
879 goto done;
880 }
881 #ifdef DEBUG
882 /*
883 * Verify that apps don't forget to clear
884 * sin6_scope_id etc
885 */
886 if (sin6->sin6_scope_id != 0 &&
887 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
888 zcmn_err(getzoneid(), CE_WARN,
889 "bind with uninitialized sin6_scope_id "
890 "(%d) on socket. Pid = %d\n",
891 (int)sin6->sin6_scope_id,
892 (int)curproc->p_pid);
893 }
894 if (sin6->__sin6_src_id != 0) {
895 zcmn_err(getzoneid(), CE_WARN,
896 "bind with uninitialized __sin6_src_id "
897 "(%d) on socket. Pid = %d\n",
898 (int)sin6->__sin6_src_id,
899 (int)curproc->p_pid);
900 }
901 #endif /* DEBUG */
902 break;
903 }
904 default:
905 /*
906 * Don't do any length or sa_family check to allow
907 * non-sockaddr style addresses.
908 */
909 if (name == NULL) {
910 error = EINVAL;
911 eprintsoline(so, error);
912 goto done;
913 }
914 break;
915 }
916
917 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
918 error = ENAMETOOLONG;
919 eprintsoline(so, error);
920 goto done;
921 }
922 /*
923 * Save local address.
924 */
925 sti->sti_laddr_len = (socklen_t)namelen;
926 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
927 bcopy(name, sti->sti_laddr_sa, namelen);
928
929 addr = sti->sti_laddr_sa;
930 addrlen = (t_uscalar_t)sti->sti_laddr_len;
931 switch (so->so_family) {
932 case AF_INET6:
933 case AF_INET:
934 break;
935 case AF_UNIX: {
936 struct sockaddr_un *soun =
937 (struct sockaddr_un *)sti->sti_laddr_sa;
938 struct vnode *vp, *rvp;
939 struct vattr vattr;
940
941 ASSERT(sti->sti_ux_bound_vp == NULL);
942 /*
943 * Create vnode for the specified path name.
944 * Keep vnode held with a reference in sti_ux_bound_vp.
945 * Use the vnode pointer as the address used in the
946 * bind with the transport.
947 *
948 * Use the same mode as in BSD. In particular this does
949 * not observe the umask.
950 */
951 /* MAXPATHLEN + soun_family + nul termination */
952 if (sti->sti_laddr_len >
953 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
954 error = ENAMETOOLONG;
955 eprintsoline(so, error);
956 goto done;
957 }
958 vattr.va_type = VSOCK;
959 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
960 vattr.va_mask = AT_TYPE|AT_MODE;
961 /* NOTE: holding so_lock */
962 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
963 EXCL, 0, &vp, CRMKNOD, 0, 0);
964 if (error) {
965 if (error == EEXIST)
966 error = EADDRINUSE;
967 eprintsoline(so, error);
968 goto done;
969 }
970 /*
971 * Establish pointer from the underlying filesystem
972 * vnode to the socket node.
973 * sti_ux_bound_vp and v_stream->sd_vnode form the
974 * cross-linkage between the underlying filesystem
975 * node and the socket node.
976 */
977
978 if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
979 VN_HOLD(rvp);
980 VN_RELE(vp);
981 vp = rvp;
982 }
983
984 ASSERT(SOTOV(so)->v_stream);
985 mutex_enter(&vp->v_lock);
986 vp->v_stream = SOTOV(so)->v_stream;
987 sti->sti_ux_bound_vp = vp;
988 mutex_exit(&vp->v_lock);
989
990 /*
991 * Use the vnode pointer value as a unique address
992 * (together with the magic number to avoid conflicts
993 * with implicit binds) in the transport provider.
994 */
995 sti->sti_ux_laddr.soua_vp =
996 (void *)sti->sti_ux_bound_vp;
997 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
998 addr = &sti->sti_ux_laddr;
999 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1000 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1001 addrlen,
1002 (void *)((struct so_ux_addr *)addr)->soua_vp));
1003 break;
1004 }
1005 } /* end switch (so->so_family) */
1006 }
1007
1008 /*
1009 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1010 * the transport can start passing up T_CONN_IND messages
1011 * as soon as it receives the bind req and strsock_proto()
1012 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1013 */
1014 if (flags & _SOBIND_LISTEN) {
1015 if ((so->so_state & SS_ACCEPTCONN) == 0)
1016 clear_acceptconn_on_err = B_TRUE;
1017 save_so_backlog = so->so_backlog;
1018 restore_backlog_on_err = B_TRUE;
1019 so->so_state |= SS_ACCEPTCONN;
1020 so->so_backlog = backlog;
1021 }
1022
1023 /*
1024 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1025 * for other transports we will send in a O_T_BIND_REQ.
1026 */
1027 if (tcp_udp_xport &&
1028 (so->so_family == AF_INET || so->so_family == AF_INET6))
1029 PRIM_type = T_BIND_REQ;
1030
1031 bind_req.PRIM_type = PRIM_type;
1032 bind_req.ADDR_length = addrlen;
1033 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1034 bind_req.CONIND_number = backlog;
1035 /* NOTE: holding so_lock while sleeping */
1036 mp = soallocproto2(&bind_req, sizeof (bind_req),
1037 addr, addrlen, 0, _ALLOC_SLEEP, cr);
1038 sti->sti_laddr_valid = 0;
1039
1040 /* Done using sti_laddr_sa - can drop the lock */
1041 mutex_exit(&so->so_lock);
1042
1043 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1044 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1045 if (error) {
1046 eprintsoline(so, error);
1047 mutex_enter(&so->so_lock);
1048 goto done;
1049 }
1050
1051 mutex_enter(&so->so_lock);
1052 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1053 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1054 if (error) {
1055 eprintsoline(so, error);
1056 goto done;
1057 }
1058 ASSERT(mp);
1059 /*
1060 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1061 * strsock_proto while the lock was dropped above, the bind
1062 * is allowed to complete.
1063 */
1064
1065 /* Mark as bound. This will be undone if we detect errors below. */
1066 if (flags & _SOBIND_NOXLATE) {
1067 ASSERT(so->so_family == AF_UNIX);
1068 sti->sti_faddr_noxlate = 1;
1069 }
1070 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1071 so->so_state |= SS_ISBOUND;
1072 ASSERT(sti->sti_unbind_mp);
1073
1074 /* note that we've already set SS_ACCEPTCONN above */
1075
1076 /*
	 * Recompute addrlen - an unspecified bind sent down an
1078 * address of length zero but we expect the appropriate length
1079 * in return.
1080 */
1081 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1082 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1083
1084 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1085 /*
1086 * The alignment restriction is really too strict but
1087 * we want enough alignment to inspect the fields of
1088 * a sockaddr_in.
1089 */
1090 addr = sogetoff(mp, bind_ack->ADDR_offset,
1091 bind_ack->ADDR_length,
1092 __TPI_ALIGN_SIZE);
1093 if (addr == NULL) {
1094 freemsg(mp);
1095 error = EPROTO;
1096 eprintsoline(so, error);
1097 goto done;
1098 }
1099 if (!(flags & _SOBIND_UNSPEC)) {
1100 /*
1101 * Verify that the transport didn't return something we
1102 * did not want e.g. an address other than what we asked for.
1103 *
1104 * NOTE: These checks would go away if/when we switch to
1105 * using the new TPI (in which the transport would fail
1106 * the request instead of assigning a different address).
1107 *
1108 * NOTE2: For protocols that we don't know (i.e. any
1109 * other than AF_INET6, AF_INET and AF_UNIX), we
1110 * cannot know if the transport should be expected to
1111 * return the same address as that requested.
1112 *
1113 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1114 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1115 *
1116 * For example, in the case of netatalk it may be
1117 * inappropriate for the transport to return the
1118 * requested address (as it may have allocated a local
1119 * port number in behaviour similar to that of an
1120 * AF_INET bind request with a port number of zero).
1121 *
1122 * Given the definition of O_T_BIND_REQ, where the
1123 * transport may bind to an address other than the
1124 * requested address, it's not possible to determine
1125 * whether a returned address that differs from the
1126 * requested address is a reason to fail (because the
1127 * requested address was not available) or succeed
1128 * (because the transport allocated an appropriate
1129 * address and/or port).
1130 *
1131 * sockfs currently requires that the transport return
1132 * the requested address in the T_BIND_ACK, unless
1133 * there is code here to allow for any discrepancy.
1134 * Such code exists for AF_INET and AF_INET6.
1135 *
1136 * Netatalk chooses to return the requested address
1137 * rather than the (correct) allocated address. This
1138 * means that netatalk violates the TPI specification
1139 * (and would not function correctly if used from a
1140 * TLI application), but it does mean that it works
1141 * with sockfs.
1142 *
1143 * As noted above, using the newer XTI bind primitive
1144 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1145 * allow sockfs to be more sure about whether or not
1146 * the bind request had succeeded (as transports are
1147 * not permitted to bind to a different address than
1148 * that requested - they must return failure).
1149 * Unfortunately, support for T_BIND_REQ may not be
1150 * present in all transport implementations (netatalk,
1151 * for example, doesn't have it), making the
1152 * transition difficult.
1153 */
1154 if (bind_ack->ADDR_length != addrlen) {
1155 /* Assumes that the requested address was in use */
1156 freemsg(mp);
1157 error = EADDRINUSE;
1158 eprintsoline(so, error);
1159 goto done;
1160 }
1161
1162 switch (so->so_family) {
1163 case AF_INET6:
1164 case AF_INET: {
1165 sin_t *rname, *aname;
1166
1167 rname = (sin_t *)addr;
1168 aname = (sin_t *)sti->sti_laddr_sa;
1169
1170 /*
1171 * Take advantage of the alignment
1172 * of sin_port and sin6_port which fall
1173 * in the same place in their data structures.
1174 * Just use sin_port for either address family.
1175 *
1176 * This may become a problem if (heaven forbid)
1177 * there's a separate ipv6port_reserved... :-P
1178 *
1179 * Binding to port 0 has the semantics of letting
1180 * the transport bind to any port.
1181 *
1182 * If the transport is TCP or UDP since we had sent
1183 * a T_BIND_REQ we would not get a port other than
1184 * what we asked for.
1185 */
1186 if (tcp_udp_xport) {
1187 /*
1188 * Pick up the new port number if we bound to
1189 * port 0.
1190 */
1191 if (aname->sin_port == 0)
1192 aname->sin_port = rname->sin_port;
1193 sti->sti_laddr_valid = 1;
1194 break;
1195 }
1196 if (aname->sin_port != 0 &&
1197 aname->sin_port != rname->sin_port) {
1198 freemsg(mp);
1199 error = EADDRINUSE;
1200 eprintsoline(so, error);
1201 goto done;
1202 }
1203 /*
1204 * Pick up the new port number if we bound to port 0.
1205 */
1206 aname->sin_port = rname->sin_port;
1207
1208 /*
1209 * Unfortunately, addresses aren't _quite_ the same.
1210 */
1211 if (so->so_family == AF_INET) {
1212 if (aname->sin_addr.s_addr !=
1213 rname->sin_addr.s_addr) {
1214 freemsg(mp);
1215 error = EADDRNOTAVAIL;
1216 eprintsoline(so, error);
1217 goto done;
1218 }
1219 } else {
1220 sin6_t *rname6 = (sin6_t *)rname;
1221 sin6_t *aname6 = (sin6_t *)aname;
1222
1223 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1224 &rname6->sin6_addr)) {
1225 freemsg(mp);
1226 error = EADDRNOTAVAIL;
1227 eprintsoline(so, error);
1228 goto done;
1229 }
1230 }
1231 break;
1232 }
1233 case AF_UNIX:
1234 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1235 freemsg(mp);
1236 error = EADDRINUSE;
1237 eprintsoline(so, error);
1238 eprintso(so,
1239 ("addrlen %d, addr 0x%x, vp %p\n",
1240 addrlen, *((int *)addr),
1241 (void *)sti->sti_ux_bound_vp));
1242 goto done;
1243 }
1244 sti->sti_laddr_valid = 1;
1245 break;
1246 default:
1247 /*
1248 * NOTE: This assumes that addresses can be
1249 * byte-compared for equivalence.
1250 */
1251 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1252 freemsg(mp);
1253 error = EADDRINUSE;
1254 eprintsoline(so, error);
1255 goto done;
1256 }
1257 /*
1258 * Don't mark sti_laddr_valid, as we cannot be
1259 * sure that the returned address is the real
1260 * bound address when talking to an unknown
1261 * transport.
1262 */
1263 break;
1264 }
1265 } else {
1266 /*
		 * Save the returned address for getsockname.
1268 * Needed for unspecific bind unless transport supports
1269 * the TI_GETMYNAME ioctl.
1270 * Do this for AF_INET{,6} even though they do, as
1271 * caching info here is much better performance than
1272 * a TPI/STREAMS trip to the transport for getsockname.
1273 * Any which can't for some reason _must_ _not_ set
1274 * sti_laddr_valid here for the caching version of
1275 * getsockname to not break;
1276 */
1277 switch (so->so_family) {
1278 case AF_UNIX:
1279 /*
1280 * Record the address bound with the transport
1281 * for use by socketpair.
1282 */
1283 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1284 sti->sti_laddr_valid = 1;
1285 break;
1286 case AF_INET:
1287 case AF_INET6:
1288 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1289 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1290 sti->sti_laddr_valid = 1;
1291 break;
1292 default:
1293 /*
1294 * Don't mark sti_laddr_valid, as we cannot be
1295 * sure that the returned address is the real
1296 * bound address when talking to an unknown
1297 * transport.
1298 */
1299 break;
1300 }
1301 }
1302
1303 freemsg(mp);
1304
1305 done:
1306 if (error) {
1307 /* reset state & backlog to values held on entry */
1308 if (clear_acceptconn_on_err == B_TRUE)
1309 so->so_state &= ~SS_ACCEPTCONN;
1310 if (restore_backlog_on_err == B_TRUE)
1311 so->so_backlog = save_so_backlog;
1312
1313 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1314 int err;
1315
1316 err = sotpi_unbind(so, 0);
1317 /* LINTED - statement has no consequent: if */
1318 if (err) {
1319 eprintsoline(so, error);
1320 } else {
1321 ASSERT(!(so->so_state & SS_ISBOUND));
1322 }
1323 }
1324 }
1325 if (!(flags & _SOBIND_LOCK_HELD)) {
1326 so_unlock_single(so, SOLOCKED);
1327 mutex_exit(&so->so_lock);
1328 } else {
1329 ASSERT(MUTEX_HELD(&so->so_lock));
1330 ASSERT(so->so_flag & SOLOCKED);
1331 }
1332 return (error);
1333 }
1334
1335 /* bind the socket */
1336 static int
sotpi_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,struct cred * cr)1337 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1338 int flags, struct cred *cr)
1339 {
1340 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1341 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1342
1343 flags &= ~_SOBIND_SOCKETPAIR;
1344 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1345 }
1346
1347 /*
1348 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1349 * address, or when listen needs to unbind and bind.
1350 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1351 * so that a sobind can pick them up.
1352 */
1353 static int
sotpi_unbind(struct sonode * so,int flags)1354 sotpi_unbind(struct sonode *so, int flags)
1355 {
1356 struct T_unbind_req unbind_req;
1357 int error = 0;
1358 mblk_t *mp;
1359 sotpi_info_t *sti = SOTOTPI(so);
1360
1361 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1362 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1363
1364 ASSERT(MUTEX_HELD(&so->so_lock));
1365 ASSERT(so->so_flag & SOLOCKED);
1366
1367 if (!(so->so_state & SS_ISBOUND)) {
1368 error = EINVAL;
1369 eprintsoline(so, error);
1370 goto done;
1371 }
1372
1373 mutex_exit(&so->so_lock);
1374
1375 /*
1376 * Flush the read and write side (except stream head read queue)
1377 * and send down T_UNBIND_REQ.
1378 */
1379 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1380
1381 unbind_req.PRIM_type = T_UNBIND_REQ;
1382 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1383 0, _ALLOC_SLEEP, CRED());
1384 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1385 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1386 mutex_enter(&so->so_lock);
1387 if (error) {
1388 eprintsoline(so, error);
1389 goto done;
1390 }
1391
1392 error = sowaitokack(so, T_UNBIND_REQ);
1393 if (error) {
1394 eprintsoline(so, error);
1395 goto done;
1396 }
1397
1398 /*
1399 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1400 * strsock_proto while the lock was dropped above, the unbind
1401 * is allowed to complete.
1402 */
1403 if (!(flags & _SOUNBIND_REBIND)) {
1404 /*
1405 * Clear out bound address.
1406 */
1407 vnode_t *vp;
1408
1409 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1410 sti->sti_ux_bound_vp = NULL;
1411 vn_rele_stream(vp);
1412 }
1413 /* Clear out address */
1414 sti->sti_laddr_len = 0;
1415 }
1416 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1417 sti->sti_laddr_valid = 0;
1418
1419 done:
1420
1421 /* If the caller held the lock don't release it here */
1422 ASSERT(MUTEX_HELD(&so->so_lock));
1423 ASSERT(so->so_flag & SOLOCKED);
1424
1425 return (error);
1426 }
1427
1428 /*
1429 * listen on the socket.
1430 * For TPI conforming transports this has to first unbind with the transport
1431 * and then bind again using the new backlog.
1432 */
1433 /* ARGSUSED */
1434 int
sotpi_listen(struct sonode * so,int backlog,struct cred * cr)1435 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1436 {
1437 int error = 0;
1438 sotpi_info_t *sti = SOTOTPI(so);
1439
1440 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1441 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1442
1443 if (sti->sti_serv_type == T_CLTS)
1444 return (EOPNOTSUPP);
1445
1446 /*
1447 * If the socket is ready to accept connections already, then
1448 * return without doing anything. This avoids a problem where
1449 * a second listen() call fails if a connection is pending and
1450 * leaves the socket unbound. Only when we are not unbinding
1451 * with the transport can we safely increase the backlog.
1452 */
1453 if (so->so_state & SS_ACCEPTCONN &&
1454 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1455 /*CONSTCOND*/
1456 !solisten_tpi_tcp))
1457 return (0);
1458
1459 if (so->so_state & SS_ISCONNECTED)
1460 return (EINVAL);
1461
1462 mutex_enter(&so->so_lock);
1463 so_lock_single(so); /* Set SOLOCKED */
1464
1465 /*
1466 * If the listen doesn't change the backlog we do nothing.
1467 * This avoids an EPROTO error from the transport.
1468 */
1469 if ((so->so_state & SS_ACCEPTCONN) &&
1470 so->so_backlog == backlog)
1471 goto done;
1472
1473 if (!(so->so_state & SS_ISBOUND)) {
1474 /*
1475 * Must have been explicitly bound in the UNIX domain.
1476 */
1477 if (so->so_family == AF_UNIX) {
1478 error = EINVAL;
1479 goto done;
1480 }
1481 error = sotpi_bindlisten(so, NULL, 0, backlog,
1482 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1483 } else if (backlog > 0) {
1484 /*
1485 * AF_INET{,6} hack to avoid losing the port.
1486 * Assumes that all AF_INET{,6} transports can handle a
1487 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1488 * has already bound thus it is possible to avoid the unbind.
1489 */
1490 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1491 /*CONSTCOND*/
1492 !solisten_tpi_tcp)) {
1493 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1494 if (error)
1495 goto done;
1496 }
1497 error = sotpi_bindlisten(so, NULL, 0, backlog,
1498 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1499 } else {
1500 so->so_state |= SS_ACCEPTCONN;
1501 so->so_backlog = backlog;
1502 }
1503 if (error)
1504 goto done;
1505 ASSERT(so->so_state & SS_ACCEPTCONN);
1506 done:
1507 so_unlock_single(so, SOLOCKED);
1508 mutex_exit(&so->so_lock);
1509 return (error);
1510 }
1511
1512 /*
1513 * Disconnect either a specified seqno or all (-1).
1514 * The former is used on listening sockets only.
1515 *
1516 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1517 * the current use of sodisconnect(seqno == -1) is only for shutdown
1518 * so there is no point (and potentially incorrect) to unbind.
1519 */
1520 static int
sodisconnect(struct sonode * so,t_scalar_t seqno,int flags)1521 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1522 {
1523 struct T_discon_req discon_req;
1524 int error = 0;
1525 mblk_t *mp;
1526
1527 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1528 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1529
1530 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1531 mutex_enter(&so->so_lock);
1532 so_lock_single(so); /* Set SOLOCKED */
1533 } else {
1534 ASSERT(MUTEX_HELD(&so->so_lock));
1535 ASSERT(so->so_flag & SOLOCKED);
1536 }
1537
1538 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1539 error = EINVAL;
1540 eprintsoline(so, error);
1541 goto done;
1542 }
1543
1544 mutex_exit(&so->so_lock);
1545 /*
1546 * Flush the write side (unless this is a listener)
1547 * and then send down a T_DISCON_REQ.
1548 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1549 * and other messages.)
1550 */
1551 if (!(so->so_state & SS_ACCEPTCONN))
1552 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1553
1554 discon_req.PRIM_type = T_DISCON_REQ;
1555 discon_req.SEQ_number = seqno;
1556 mp = soallocproto1(&discon_req, sizeof (discon_req),
1557 0, _ALLOC_SLEEP, CRED());
1558 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1559 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1560 mutex_enter(&so->so_lock);
1561 if (error) {
1562 eprintsoline(so, error);
1563 goto done;
1564 }
1565
1566 error = sowaitokack(so, T_DISCON_REQ);
1567 if (error) {
1568 eprintsoline(so, error);
1569 goto done;
1570 }
1571 /*
1572 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1573 * strsock_proto while the lock was dropped above, the disconnect
1574 * is allowed to complete. However, it is not possible to
1575 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1576 */
1577 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1578 SOTOTPI(so)->sti_laddr_valid = 0;
1579 SOTOTPI(so)->sti_faddr_valid = 0;
1580 done:
1581 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1582 so_unlock_single(so, SOLOCKED);
1583 mutex_exit(&so->so_lock);
1584 } else {
1585 /* If the caller held the lock don't release it here */
1586 ASSERT(MUTEX_HELD(&so->so_lock));
1587 ASSERT(so->so_flag & SOLOCKED);
1588 }
1589 return (error);
1590 }
1591
1592 /* ARGSUSED */
1593 int
sotpi_accept(struct sonode * so,int fflag,struct cred * cr,struct sonode ** nsop)1594 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1595 struct sonode **nsop)
1596 {
1597 struct T_conn_ind *conn_ind;
1598 struct T_conn_res *conn_res;
1599 int error = 0;
1600 mblk_t *mp, *ack_mp;
1601 struct sonode *nso;
1602 vnode_t *nvp;
1603 void *src;
1604 t_uscalar_t srclen;
1605 void *opt;
1606 t_uscalar_t optlen;
1607 t_scalar_t PRIM_type;
1608 t_scalar_t SEQ_number;
1609 size_t sinlen;
1610 sotpi_info_t *sti = SOTOTPI(so);
1611 sotpi_info_t *nsti;
1612
1613 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1614 (void *)so, fflag, (void *)nsop,
1615 pr_state(so->so_state, so->so_mode)));
1616
1617 /*
1618 * Defer single-threading the accepting socket until
1619 * the T_CONN_IND has been received and parsed and the
1620 * new sonode has been opened.
1621 */
1622
1623 /* Check that we are not already connected */
1624 if ((so->so_state & SS_ACCEPTCONN) == 0)
1625 goto conn_bad;
1626
1627 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1628 goto e_bad;
1629
1630 ASSERT(mp != NULL);
1631 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1632
1633 /*
1634 * Save SEQ_number for error paths.
1635 */
1636 SEQ_number = conn_ind->SEQ_number;
1637
1638 srclen = conn_ind->SRC_length;
1639 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1640 if (src == NULL) {
1641 error = EPROTO;
1642 freemsg(mp);
1643 eprintsoline(so, error);
1644 goto disconnect_unlocked;
1645 }
1646 optlen = conn_ind->OPT_length;
1647 switch (so->so_family) {
1648 case AF_INET:
1649 case AF_INET6:
1650 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1651 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1652 &opt, conn_ind->OPT_length);
1653 } else {
1654 /*
1655 * The transport (in this case TCP) hasn't sent up
1656 * a pointer to an instance for the accept fast-path.
1657 * Disable fast-path completely because the call to
1658 * sotpi_create() below would otherwise create an
1659 * incomplete TCP instance, which would lead to
1660 * problems when sockfs sends a normal T_CONN_RES
1661 * message down the new stream.
1662 */
1663 if (sti->sti_direct) {
1664 int rval;
1665 /*
1666 * For consistency we inform tcp to disable
1667 * direct interface on the listener, though
1668 * we can certainly live without doing this
1669 * because no data will ever travel upstream
1670 * on the listening socket.
1671 */
1672 sti->sti_direct = 0;
1673 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1674 0, 0, K_TO_K, cr, &rval);
1675 }
1676 opt = NULL;
1677 optlen = 0;
1678 }
1679 break;
1680 case AF_UNIX:
1681 default:
1682 if (optlen != 0) {
1683 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1684 __TPI_ALIGN_SIZE);
1685 if (opt == NULL) {
1686 error = EPROTO;
1687 freemsg(mp);
1688 eprintsoline(so, error);
1689 goto disconnect_unlocked;
1690 }
1691 }
1692 if (so->so_family == AF_UNIX) {
1693 if (!sti->sti_faddr_noxlate) {
1694 src = NULL;
1695 srclen = 0;
1696 }
1697 /* Extract src address from options */
1698 if (optlen != 0)
1699 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1700 }
1701 break;
1702 }
1703
1704 /*
1705 * Create the new socket.
1706 */
1707 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1708 if (nso == NULL) {
1709 ASSERT(error != 0);
1710 /*
1711 * Accept can not fail with ENOBUFS. sotpi_create
1712 * sleeps waiting for memory until a signal is caught
1713 * so return EINTR.
1714 */
1715 freemsg(mp);
1716 if (error == ENOBUFS)
1717 error = EINTR;
1718 goto e_disc_unl;
1719 }
1720 nvp = SOTOV(nso);
1721 nsti = SOTOTPI(nso);
1722
1723 #ifdef DEBUG
1724 /*
1725 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1726 * it's inherited early to allow debugging of the accept code itself.
1727 */
1728 nso->so_options |= so->so_options & SO_DEBUG;
1729 #endif /* DEBUG */
1730
1731 /*
1732 * Save the SRC address from the T_CONN_IND
1733 * for getpeername to work on AF_UNIX and on transports that do not
1734 * support TI_GETPEERNAME.
1735 *
1736 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1737 * copyin_name().
1738 */
1739 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1740 error = EINVAL;
1741 freemsg(mp);
1742 eprintsoline(so, error);
1743 goto disconnect_vp_unlocked;
1744 }
1745 nsti->sti_faddr_len = (socklen_t)srclen;
1746 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1747 bcopy(src, nsti->sti_faddr_sa, srclen);
1748 nsti->sti_faddr_valid = 1;
1749
1750 /*
1751 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1752 */
1753 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1754 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1755 cred_t *cr;
1756 pid_t cpid;
1757
1758 cr = msg_getcred(mp, &cpid);
1759 if (cr != NULL) {
1760 crhold(cr);
1761 nso->so_peercred = cr;
1762 nso->so_cpid = cpid;
1763 }
1764 freemsg(mp);
1765
1766 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1767 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1768 if (mp == NULL) {
1769 /*
1770 * Accept can not fail with ENOBUFS.
1771 * A signal was caught so return EINTR.
1772 */
1773 error = EINTR;
1774 eprintsoline(so, error);
1775 goto disconnect_vp_unlocked;
1776 }
1777 conn_res = (struct T_conn_res *)mp->b_rptr;
1778 } else {
1779 /*
1780 * For efficency reasons we use msg_extractcred; no crhold
1781 * needed since db_credp is cleared (i.e., we move the cred
1782 * from the message to so_peercred.
1783 */
1784 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1785
1786 mp->b_rptr = DB_BASE(mp);
1787 conn_res = (struct T_conn_res *)mp->b_rptr;
1788 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1789
1790 mblk_setcred(mp, cr, curproc->p_pid);
1791 }
1792
1793 /*
1794 * New socket must be bound at least in sockfs and, except for AF_INET,
1795 * (or AF_INET6) it also has to be bound in the transport provider.
1796 * We set the local address in the sonode from the T_OK_ACK of the
1797 * T_CONN_RES. For this reason the address we bind to here isn't
1798 * important.
1799 */
1800 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1801 /*CONSTCOND*/
1802 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1803 /*
1804 * Optimization for AF_INET{,6} transports
1805 * that can handle a T_CONN_RES without being bound.
1806 */
1807 mutex_enter(&nso->so_lock);
1808 so_automatic_bind(nso);
1809 mutex_exit(&nso->so_lock);
1810 } else {
1811 /* Perform NULL bind with the transport provider. */
1812 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1813 cr)) != 0) {
1814 ASSERT(error != ENOBUFS);
1815 freemsg(mp);
1816 eprintsoline(nso, error);
1817 goto disconnect_vp_unlocked;
1818 }
1819 }
1820
1821 /*
1822 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1823 * so that any data arriving on the new socket will cause the
1824 * appropriate signals to be delivered for the new socket.
1825 *
1826 * No other thread (except strsock_proto and strsock_misc)
1827 * can access the new socket thus we relax the locking.
1828 */
1829 nso->so_pgrp = so->so_pgrp;
1830 nso->so_state |= so->so_state & SS_ASYNC;
1831 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1832
1833 if (nso->so_pgrp != 0) {
1834 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1835 eprintsoline(nso, error);
1836 error = 0;
1837 nso->so_pgrp = 0;
1838 }
1839 }
1840
1841 /*
1842 * Make note of the socket level options. TCP and IP level options
1843 * are already inherited. We could do all this after accept is
1844 * successful but doing it here simplifies code and no harm done
1845 * for error case.
1846 */
1847 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1848 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1849 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1850 nso->so_sndbuf = so->so_sndbuf;
1851 nso->so_rcvbuf = so->so_rcvbuf;
1852 if (nso->so_options & SO_LINGER)
1853 nso->so_linger = so->so_linger;
1854
1855 /*
1856 * Note that the following sti_direct code path should be
1857 * removed once we are confident that the direct sockets
1858 * do not result in any degradation.
1859 */
1860 if (sti->sti_direct) {
1861
1862 ASSERT(opt != NULL);
1863
1864 conn_res->OPT_length = optlen;
1865 conn_res->OPT_offset = MBLKL(mp);
1866 bcopy(&opt, mp->b_wptr, optlen);
1867 mp->b_wptr += optlen;
1868 conn_res->PRIM_type = T_CONN_RES;
1869 conn_res->ACCEPTOR_id = 0;
1870 PRIM_type = T_CONN_RES;
1871
1872 /* Send down the T_CONN_RES on acceptor STREAM */
1873 error = kstrputmsg(SOTOV(nso), mp, NULL,
1874 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1875 if (error) {
1876 mutex_enter(&so->so_lock);
1877 so_lock_single(so);
1878 eprintsoline(so, error);
1879 goto disconnect_vp;
1880 }
1881 mutex_enter(&nso->so_lock);
1882 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1883 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1884 if (error) {
1885 mutex_exit(&nso->so_lock);
1886 mutex_enter(&so->so_lock);
1887 so_lock_single(so);
1888 eprintsoline(so, error);
1889 goto disconnect_vp;
1890 }
1891 if (nso->so_family == AF_INET) {
1892 sin_t *sin;
1893
1894 sin = (sin_t *)(ack_mp->b_rptr +
1895 sizeof (struct T_ok_ack));
1896 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1897 nsti->sti_laddr_len = sizeof (sin_t);
1898 } else {
1899 sin6_t *sin6;
1900
1901 sin6 = (sin6_t *)(ack_mp->b_rptr +
1902 sizeof (struct T_ok_ack));
1903 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1904 nsti->sti_laddr_len = sizeof (sin6_t);
1905 }
1906 freemsg(ack_mp);
1907
1908 nso->so_state |= SS_ISCONNECTED;
1909 nso->so_proto_handle = (sock_lower_handle_t)opt;
1910 nsti->sti_laddr_valid = 1;
1911
1912 mutex_exit(&nso->so_lock);
1913
1914 /*
1915 * It's possible, through the use of autopush for example,
1916 * that the acceptor stream may not support sti_direct
1917 * semantics. If the new socket does not support sti_direct
1918 * we issue a _SIOCSOCKFALLBACK to inform the transport
1919 * as we would in the I_PUSH case.
1920 */
1921 if (nsti->sti_direct == 0) {
1922 int rval;
1923
1924 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1925 0, 0, K_TO_K, cr, &rval)) != 0) {
1926 mutex_enter(&so->so_lock);
1927 so_lock_single(so);
1928 eprintsoline(so, error);
1929 goto disconnect_vp;
1930 }
1931 }
1932
1933 /*
1934 * Pass out new socket.
1935 */
1936 if (nsop != NULL)
1937 *nsop = nso;
1938
1939 return (0);
1940 }
1941
1942 /*
1943 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1944 * which don't support the FireEngine accept fast-path. It is also
1945 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1946 * again. Neither sockfs nor TCP attempt to find out if some other
1947 * random module has been inserted in between (in which case we
1948 * should follow TLI accept behaviour). We blindly assume the worst
1949 * case and revert back to old behaviour i.e. TCP will not send us
1950 * any option (eager) and the accept should happen on the listener
1951 * queue. Any queued T_conn_ind have already got their options removed
1952 * by so_sock2_stream() when "sockmod" was I_POP'd.
1953 */
1954 /*
1955 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1956 */
1957 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1958 #ifdef _ILP32
1959 queue_t *q;
1960
1961 /*
1962 * Find read queue in driver
1963 * Can safely do this since we "own" nso/nvp.
1964 */
1965 q = strvp2wq(nvp)->q_next;
1966 while (SAMESTR(q))
1967 q = q->q_next;
1968 q = RD(q);
1969 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1970 #else
1971 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1972 #endif /* _ILP32 */
1973 conn_res->PRIM_type = O_T_CONN_RES;
1974 PRIM_type = O_T_CONN_RES;
1975 } else {
1976 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
1977 conn_res->PRIM_type = T_CONN_RES;
1978 PRIM_type = T_CONN_RES;
1979 }
1980 conn_res->SEQ_number = SEQ_number;
1981 conn_res->OPT_length = 0;
1982 conn_res->OPT_offset = 0;
1983
1984 mutex_enter(&so->so_lock);
1985 so_lock_single(so); /* Set SOLOCKED */
1986 mutex_exit(&so->so_lock);
1987
1988 error = kstrputmsg(SOTOV(so), mp, NULL,
1989 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1990 mutex_enter(&so->so_lock);
1991 if (error) {
1992 eprintsoline(so, error);
1993 goto disconnect_vp;
1994 }
1995 error = sowaitprim(so, PRIM_type, T_OK_ACK,
1996 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1997 if (error) {
1998 eprintsoline(so, error);
1999 goto disconnect_vp;
2000 }
2001 mutex_exit(&so->so_lock);
2002 /*
2003 * If there is a sin/sin6 appended onto the T_OK_ACK use
2004 * that to set the local address. If this is not present
2005 * then we zero out the address and don't set the
2006 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2007 * the pathname from the listening socket.
2008 * In the case where this is TCP or an AF_UNIX socket the
2009 * client side may have queued data or a T_ORDREL in the
2010 * transport. Having now sent the T_CONN_RES we may receive
2011 * those queued messages at any time. Hold the acceptor
2012 * so_lock until its state and laddr are finalized.
2013 */
2014 mutex_enter(&nso->so_lock);
2015 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2016 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
2017 MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2018 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2019 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2020 nsti->sti_laddr_len = sinlen;
2021 nsti->sti_laddr_valid = 1;
2022 } else if (nso->so_family == AF_UNIX) {
2023 ASSERT(so->so_family == AF_UNIX);
2024 nsti->sti_laddr_len = sti->sti_laddr_len;
2025 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2026 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2027 nsti->sti_laddr_len);
2028 nsti->sti_laddr_valid = 1;
2029 } else {
2030 nsti->sti_laddr_len = sti->sti_laddr_len;
2031 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2032 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2033 nsti->sti_laddr_sa->sa_family = nso->so_family;
2034 }
2035 nso->so_state |= SS_ISCONNECTED;
2036 mutex_exit(&nso->so_lock);
2037
2038 freemsg(ack_mp);
2039
2040 mutex_enter(&so->so_lock);
2041 so_unlock_single(so, SOLOCKED);
2042 mutex_exit(&so->so_lock);
2043
2044 /*
2045 * Pass out new socket.
2046 */
2047 if (nsop != NULL)
2048 *nsop = nso;
2049
2050 return (0);
2051
2052 e_disc_unl:
2053 eprintsoline(so, error);
2054 goto disconnect_unlocked;
2055
2056 disconnect_vp_unlocked:
2057 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2058 VN_RELE(nvp);
2059 disconnect_unlocked:
2060 (void) sodisconnect(so, SEQ_number, 0);
2061 return (error);
2062
2063 disconnect_vp:
2064 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2065 so_unlock_single(so, SOLOCKED);
2066 mutex_exit(&so->so_lock);
2067 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2068 VN_RELE(nvp);
2069 return (error);
2070
2071 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2072 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2073 ? EOPNOTSUPP : EINVAL;
2074 e_bad:
2075 eprintsoline(so, error);
2076 return (error);
2077 }
2078
2079 /*
2080 * connect a socket.
2081 *
2082 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2083 * unconnect (by specifying a null address).
2084 */
2085 int
sotpi_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,struct cred * cr)2086 sotpi_connect(struct sonode *so,
2087 struct sockaddr *name,
2088 socklen_t namelen,
2089 int fflag,
2090 int flags,
2091 struct cred *cr)
2092 {
2093 struct T_conn_req conn_req;
2094 int error = 0;
2095 mblk_t *mp;
2096 void *src;
2097 socklen_t srclen;
2098 void *addr;
2099 socklen_t addrlen;
2100 boolean_t need_unlock;
2101 sotpi_info_t *sti = SOTOTPI(so);
2102
2103 dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2104 (void *)so, (void *)name, namelen, fflag, flags,
2105 pr_state(so->so_state, so->so_mode)));
2106
2107 /*
2108 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2109 * avoid sleeping for memory with SOLOCKED held.
2110 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2111 * + sizeof (struct T_opthdr).
2112 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2113 * exceed sti_faddr_maxlen).
2114 */
2115 mp = soallocproto(sizeof (struct T_conn_req) +
2116 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2117 cr);
2118 if (mp == NULL) {
2119 /*
2120 * Connect can not fail with ENOBUFS. A signal was
2121 * caught so return EINTR.
2122 */
2123 error = EINTR;
2124 eprintsoline(so, error);
2125 return (error);
2126 }
2127
2128 mutex_enter(&so->so_lock);
2129 /*
2130 * Make sure there is a preallocated T_unbind_req message
2131 * before any binding. This message is allocated when the
2132 * socket is created. Since another thread can consume
2133 * so_unbind_mp by the time we return from so_lock_single(),
2134 * we should check the availability of so_unbind_mp after
2135 * we return from so_lock_single().
2136 */
2137
2138 so_lock_single(so); /* Set SOLOCKED */
2139 need_unlock = B_TRUE;
2140
2141 if (sti->sti_unbind_mp == NULL) {
2142 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2143 /* NOTE: holding so_lock while sleeping */
2144 sti->sti_unbind_mp =
2145 soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2146 if (sti->sti_unbind_mp == NULL) {
2147 error = EINTR;
2148 goto done;
2149 }
2150 }
2151
2152 /*
2153 * Can't have done a listen before connecting.
2154 */
2155 if (so->so_state & SS_ACCEPTCONN) {
2156 error = EOPNOTSUPP;
2157 goto done;
2158 }
2159
2160 /*
2161 * Must be bound with the transport
2162 */
2163 if (!(so->so_state & SS_ISBOUND)) {
2164 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2165 /*CONSTCOND*/
2166 so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2167 /*
2168 * Optimization for AF_INET{,6} transports
2169 * that can handle a T_CONN_REQ without being bound.
2170 */
2171 so_automatic_bind(so);
2172 } else {
2173 error = sotpi_bind(so, NULL, 0,
2174 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2175 if (error)
2176 goto done;
2177 }
2178 ASSERT(so->so_state & SS_ISBOUND);
2179 flags |= _SOCONNECT_DID_BIND;
2180 }
2181
2182 /*
2183 * Handle a connect to a name parameter of type AF_UNSPEC like a
2184 * connect to a null address. This is the portable method to
2185 * unconnect a socket.
2186 */
2187 if ((namelen >= sizeof (sa_family_t)) &&
2188 (name->sa_family == AF_UNSPEC)) {
2189 name = NULL;
2190 namelen = 0;
2191 }
2192
2193 /*
2194 * Check that we are not already connected.
2195 * A connection-oriented socket cannot be reconnected.
2196 * A connected connection-less socket can be
2197 * - connected to a different address by a subsequent connect
2198 * - "unconnected" by a connect to the NULL address
2199 */
2200 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2201 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2202 if (so->so_mode & SM_CONNREQUIRED) {
2203 /* Connection-oriented socket */
2204 error = so->so_state & SS_ISCONNECTED ?
2205 EISCONN : EALREADY;
2206 goto done;
2207 }
2208 /* Connection-less socket */
2209 if (name == NULL) {
2210 /*
2211 * Remove the connected state and clear SO_DGRAM_ERRIND
2212 * since it was set when the socket was connected.
2213 * If this is UDP also send down a T_DISCON_REQ.
2214 */
2215 int val;
2216
2217 if ((so->so_family == AF_INET ||
2218 so->so_family == AF_INET6) &&
2219 (so->so_type == SOCK_DGRAM ||
2220 so->so_type == SOCK_RAW) &&
2221 /*CONSTCOND*/
2222 !soconnect_tpi_udp) {
2223 /* XXX What about implicitly unbinding here? */
2224 error = sodisconnect(so, -1,
2225 _SODISCONNECT_LOCK_HELD);
2226 } else {
2227 so->so_state &=
2228 ~(SS_ISCONNECTED | SS_ISCONNECTING);
2229 sti->sti_faddr_valid = 0;
2230 sti->sti_faddr_len = 0;
2231 }
2232
2233 /* Remove SOLOCKED since setsockopt will grab it */
2234 so_unlock_single(so, SOLOCKED);
2235 mutex_exit(&so->so_lock);
2236
2237 val = 0;
2238 (void) sotpi_setsockopt(so, SOL_SOCKET,
2239 SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2240 cr);
2241
2242 mutex_enter(&so->so_lock);
2243 so_lock_single(so); /* Set SOLOCKED */
2244 goto done;
2245 }
2246 }
2247 ASSERT(so->so_state & SS_ISBOUND);
2248
2249 if (name == NULL || namelen == 0) {
2250 error = EINVAL;
2251 goto done;
2252 }
2253 /*
2254 * Mark the socket if sti_faddr_sa represents the transport level
2255 * address.
2256 */
2257 if (flags & _SOCONNECT_NOXLATE) {
2258 struct sockaddr_ux *soaddr_ux;
2259
2260 ASSERT(so->so_family == AF_UNIX);
2261 if (namelen != sizeof (struct sockaddr_ux)) {
2262 error = EINVAL;
2263 goto done;
2264 }
2265 soaddr_ux = (struct sockaddr_ux *)name;
2266 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2267 namelen = sizeof (soaddr_ux->sou_addr);
2268 sti->sti_faddr_noxlate = 1;
2269 }
2270
2271 /*
2272 * Length and family checks.
2273 */
2274 error = so_addr_verify(so, name, namelen);
2275 if (error)
2276 goto bad;
2277
2278 /*
2279 * Save foreign address. Needed for AF_UNIX as well as
2280 * transport providers that do not support TI_GETPEERNAME.
2281 * Also used for cached foreign address for TCP and UDP.
2282 */
2283 if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2284 error = EINVAL;
2285 goto done;
2286 }
2287 sti->sti_faddr_len = (socklen_t)namelen;
2288 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2289 bcopy(name, sti->sti_faddr_sa, namelen);
2290 sti->sti_faddr_valid = 1;
2291
2292 if (so->so_family == AF_UNIX) {
2293 if (sti->sti_faddr_noxlate) {
2294 /*
2295 * sti_faddr is a transport-level address, so
2296 * don't pass it as an option. Do save it in
2297 * sti_ux_faddr, used for connected DG send.
2298 */
2299 src = NULL;
2300 srclen = 0;
2301 addr = sti->sti_faddr_sa;
2302 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2303 bcopy(addr, &sti->sti_ux_faddr,
2304 sizeof (sti->sti_ux_faddr));
2305 } else {
2306 /*
2307 * Pass the sockaddr_un source address as an option
2308 * and translate the remote address.
2309 * Holding so_lock thus sti_laddr_sa can not change.
2310 */
2311 src = sti->sti_laddr_sa;
2312 srclen = (t_uscalar_t)sti->sti_laddr_len;
2313 dprintso(so, 1,
2314 ("sotpi_connect UNIX: srclen %d, src %p\n",
2315 srclen, src));
2316 /*
2317 * Translate the destination address into our
2318 * internal form, and save it in sti_ux_faddr.
2319 * After this call, addr==&sti->sti_ux_taddr,
2320 * and we copy that to sti->sti_ux_faddr so
2321 * we save the connected peer address.
2322 */
2323 error = so_ux_addr_xlate(so,
2324 sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2325 (flags & _SOCONNECT_XPG4_2),
2326 &addr, &addrlen);
2327 if (error)
2328 goto bad;
2329 bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2330 sizeof (sti->sti_ux_faddr));
2331 }
2332 } else {
2333 addr = sti->sti_faddr_sa;
2334 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2335 src = NULL;
2336 srclen = 0;
2337 }
2338 /*
2339 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2340 * option which asks the transport provider to send T_UDERR_IND
2341 * messages. These T_UDERR_IND messages are used to return connected
2342 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2343 *
2344 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2345 * we send down a T_CONN_REQ. This is needed to let the
2346 * transport assign a local address that is consistent with
2347 * the remote address. Applications depend on a getsockname()
2348 * after a connect() to retrieve the "source" IP address for
2349 * the connected socket. Invalidate the cached local address
2350 * to force getsockname() to enquire of the transport.
2351 */
2352 if (!(so->so_mode & SM_CONNREQUIRED)) {
2353 /*
2354 * Datagram socket.
2355 */
2356 int32_t val;
2357
2358 so_unlock_single(so, SOLOCKED);
2359 mutex_exit(&so->so_lock);
2360
2361 val = 1;
2362 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2363 &val, (t_uscalar_t)sizeof (val), cr);
2364
2365 mutex_enter(&so->so_lock);
2366 so_lock_single(so); /* Set SOLOCKED */
2367 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2368 (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2369 soconnect_tpi_udp) {
2370 soisconnected(so);
2371 goto done;
2372 }
2373 /*
2374 * Send down T_CONN_REQ etc.
2375 * Clear fflag to avoid returning EWOULDBLOCK.
2376 */
2377 fflag = 0;
2378 ASSERT(so->so_family != AF_UNIX);
2379 sti->sti_laddr_valid = 0;
2380 } else if (sti->sti_laddr_len != 0) {
2381 /*
2382 * If the local address or port was "any" then it may be
2383 * changed by the transport as a result of the
2384 * connect. Invalidate the cached version if we have one.
2385 */
2386 switch (so->so_family) {
2387 case AF_INET:
2388 ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2389 if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2390 INADDR_ANY ||
2391 ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2392 sti->sti_laddr_valid = 0;
2393 break;
2394
2395 case AF_INET6:
2396 ASSERT(sti->sti_laddr_len ==
2397 (socklen_t)sizeof (sin6_t));
2398 if (IN6_IS_ADDR_UNSPECIFIED(
2399 &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2400 IN6_IS_ADDR_V4MAPPED_ANY(
2401 &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2402 ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2403 sti->sti_laddr_valid = 0;
2404 break;
2405
2406 default:
2407 break;
2408 }
2409 }
2410
2411 /*
2412 * Check for failure of an earlier call
2413 */
2414 if (so->so_error != 0)
2415 goto so_bad;
2416
2417 /*
2418 * Send down T_CONN_REQ. Message was allocated above.
2419 */
2420 conn_req.PRIM_type = T_CONN_REQ;
2421 conn_req.DEST_length = addrlen;
2422 conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2423 if (srclen == 0) {
2424 conn_req.OPT_length = 0;
2425 conn_req.OPT_offset = 0;
2426 soappendmsg(mp, &conn_req, sizeof (conn_req));
2427 soappendmsg(mp, addr, addrlen);
2428 } else {
2429 /*
2430 * There is a AF_UNIX sockaddr_un to include as a source
2431 * address option.
2432 */
2433 struct T_opthdr toh;
2434
2435 toh.level = SOL_SOCKET;
2436 toh.name = SO_SRCADDR;
2437 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2438 toh.status = 0;
2439 conn_req.OPT_length =
2440 (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2441 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2442 _TPI_ALIGN_TOPT(addrlen));
2443
2444 soappendmsg(mp, &conn_req, sizeof (conn_req));
2445 soappendmsg(mp, addr, addrlen);
2446 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2447 soappendmsg(mp, &toh, sizeof (toh));
2448 soappendmsg(mp, src, srclen);
2449 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2450 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2451 }
2452 /*
2453 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2454 * in order to have the right state when the T_CONN_CON shows up.
2455 */
2456 soisconnecting(so);
2457 mutex_exit(&so->so_lock);
2458
2459 if (AU_AUDITING())
2460 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2461
2462 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2463 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2464 mp = NULL;
2465 mutex_enter(&so->so_lock);
2466 if (error != 0)
2467 goto bad;
2468
2469 if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2470 goto bad;
2471
2472 /* Allow other threads to access the socket */
2473 so_unlock_single(so, SOLOCKED);
2474 need_unlock = B_FALSE;
2475
2476 /*
2477 * Wait until we get a T_CONN_CON or an error
2478 */
2479 if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2480 so_lock_single(so); /* Set SOLOCKED */
2481 need_unlock = B_TRUE;
2482 }
2483
2484 done:
2485 freemsg(mp);
2486 switch (error) {
2487 case EINPROGRESS:
2488 case EALREADY:
2489 case EISCONN:
2490 case EINTR:
2491 /* Non-fatal errors */
2492 sti->sti_laddr_valid = 0;
2493 /* FALLTHRU */
2494 case 0:
2495 break;
2496 default:
2497 ASSERT(need_unlock);
2498 /*
2499 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2500 * and invalidate local-address cache
2501 */
2502 so->so_state &= ~SS_ISCONNECTING;
2503 sti->sti_laddr_valid = 0;
2504 /* A discon_ind might have already unbound us */
2505 if ((flags & _SOCONNECT_DID_BIND) &&
2506 (so->so_state & SS_ISBOUND)) {
2507 int err;
2508
2509 err = sotpi_unbind(so, 0);
2510 /* LINTED - statement has no conseq */
2511 if (err) {
2512 eprintsoline(so, err);
2513 }
2514 }
2515 break;
2516 }
2517 if (need_unlock)
2518 so_unlock_single(so, SOLOCKED);
2519 mutex_exit(&so->so_lock);
2520 return (error);
2521
2522 so_bad: error = sogeterr(so, B_TRUE);
2523 bad: eprintsoline(so, error);
2524 goto done;
2525 }
2526
2527 /* ARGSUSED */
2528 int
sotpi_shutdown(struct sonode * so,int how,struct cred * cr)2529 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2530 {
2531 struct T_ordrel_req ordrel_req;
2532 mblk_t *mp;
2533 uint_t old_state, state_change;
2534 int error = 0;
2535 sotpi_info_t *sti = SOTOTPI(so);
2536
2537 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2538 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2539
2540 mutex_enter(&so->so_lock);
2541 so_lock_single(so); /* Set SOLOCKED */
2542
2543 /*
2544 * SunOS 4.X has no check for datagram sockets.
2545 * 5.X checks that it is connected (ENOTCONN)
2546 * X/Open requires that we check the connected state.
2547 */
2548 if (!(so->so_state & SS_ISCONNECTED)) {
2549 if (!xnet_skip_checks) {
2550 error = ENOTCONN;
2551 if (xnet_check_print) {
2552 printf("sockfs: X/Open shutdown check "
2553 "caused ENOTCONN\n");
2554 }
2555 }
2556 goto done;
2557 }
2558 /*
2559 * Record the current state and then perform any state changes.
2560 * Then use the difference between the old and new states to
2561 * determine which messages need to be sent.
2562 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2563 * duplicate calls to shutdown().
2564 */
2565 old_state = so->so_state;
2566
2567 switch (how) {
2568 case 0:
2569 socantrcvmore(so);
2570 break;
2571 case 1:
2572 socantsendmore(so);
2573 break;
2574 case 2:
2575 socantsendmore(so);
2576 socantrcvmore(so);
2577 break;
2578 default:
2579 error = EINVAL;
2580 goto done;
2581 }
2582
2583 /*
2584 * Assumes that the SS_CANT* flags are never cleared in the above code.
2585 */
2586 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2587 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2588 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2589
2590 switch (state_change) {
2591 case 0:
2592 dprintso(so, 1,
2593 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2594 so->so_state));
2595 goto done;
2596
2597 case SS_CANTRCVMORE:
2598 mutex_exit(&so->so_lock);
2599 strseteof(SOTOV(so), 1);
2600 /*
2601 * strseteof takes care of read side wakeups,
2602 * pollwakeups, and signals.
2603 */
2604 /*
2605 * Get the read lock before flushing data to avoid problems
2606 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2607 */
2608 mutex_enter(&so->so_lock);
2609 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2610 mutex_exit(&so->so_lock);
2611
2612 /* Flush read side queue */
2613 strflushrq(SOTOV(so), FLUSHALL);
2614
2615 mutex_enter(&so->so_lock);
2616 so_unlock_read(so); /* Clear SOREADLOCKED */
2617 break;
2618
2619 case SS_CANTSENDMORE:
2620 mutex_exit(&so->so_lock);
2621 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2622 mutex_enter(&so->so_lock);
2623 break;
2624
2625 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2626 mutex_exit(&so->so_lock);
2627 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2628 strseteof(SOTOV(so), 1);
2629 /*
2630 * strseteof takes care of read side wakeups,
2631 * pollwakeups, and signals.
2632 */
2633 /*
2634 * Get the read lock before flushing data to avoid problems
2635 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2636 */
2637 mutex_enter(&so->so_lock);
2638 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2639 mutex_exit(&so->so_lock);
2640
2641 /* Flush read side queue */
2642 strflushrq(SOTOV(so), FLUSHALL);
2643
2644 mutex_enter(&so->so_lock);
2645 so_unlock_read(so); /* Clear SOREADLOCKED */
2646 break;
2647 }
2648
2649 ASSERT(MUTEX_HELD(&so->so_lock));
2650
2651 /*
2652 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2653 * was set due to this call and the new state has both of them set:
2654 * Send the AF_UNIX close indication
2655 * For T_COTS send a discon_ind
2656 *
2657 * If cantsend was set due to this call:
2658 * For T_COTSORD send an ordrel_ind
2659 *
2660 * Note that for T_CLTS there is no message sent here.
2661 */
2662 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2663 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2664 /*
2665 * For SunOS 4.X compatibility we tell the other end
2666 * that we are unable to receive at this point.
2667 */
2668 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2669 so_unix_close(so);
2670
2671 if (sti->sti_serv_type == T_COTS)
2672 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2673 }
2674 if ((state_change & SS_CANTSENDMORE) &&
2675 (sti->sti_serv_type == T_COTS_ORD)) {
2676 /* Send an orderly release */
2677 ordrel_req.PRIM_type = T_ORDREL_REQ;
2678
2679 mutex_exit(&so->so_lock);
2680 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2681 0, _ALLOC_SLEEP, cr);
2682 /*
2683 * Send down the T_ORDREL_REQ even if there is flow control.
2684 * This prevents shutdown from blocking.
2685 * Note that there is no T_OK_ACK for ordrel_req.
2686 */
2687 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2688 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2689 mutex_enter(&so->so_lock);
2690 if (error) {
2691 eprintsoline(so, error);
2692 goto done;
2693 }
2694 }
2695
2696 done:
2697 so_unlock_single(so, SOLOCKED);
2698 mutex_exit(&so->so_lock);
2699 return (error);
2700 }
2701
2702 /*
2703 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2704 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2705 * that we have closed.
2706 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2707 * T_UNITDATA_REQ containing the same option.
2708 *
2709 * For SOCK_DGRAM half-connections (somebody connected to this end
2710 * but this end is not connect) we don't know where to send any
2711 * SO_UNIX_CLOSE.
2712 *
2713 * We have to ignore stream head errors just in case there has been
2714 * a shutdown(output).
2715 * Ignore any flow control to try to get the message more quickly to the peer.
2716 * While locally ignoring flow control solves the problem when there
2717 * is only the loopback transport on the stream it would not provide
2718 * the correct AF_UNIX socket semantics when one or more modules have
2719 * been pushed.
2720 */
2721 void
so_unix_close(struct sonode * so)2722 so_unix_close(struct sonode *so)
2723 {
2724 struct T_opthdr toh;
2725 mblk_t *mp;
2726 sotpi_info_t *sti = SOTOTPI(so);
2727
2728 ASSERT(MUTEX_HELD(&so->so_lock));
2729
2730 ASSERT(so->so_family == AF_UNIX);
2731
2732 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2733 (SS_ISCONNECTED|SS_ISBOUND))
2734 return;
2735
2736 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2737 (void *)so, pr_state(so->so_state, so->so_mode)));
2738
2739 toh.level = SOL_SOCKET;
2740 toh.name = SO_UNIX_CLOSE;
2741
2742 /* zero length + header */
2743 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2744 toh.status = 0;
2745
2746 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2747 struct T_optdata_req tdr;
2748
2749 tdr.PRIM_type = T_OPTDATA_REQ;
2750 tdr.DATA_flag = 0;
2751
2752 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2753 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2754
2755 /* NOTE: holding so_lock while sleeping */
2756 mp = soallocproto2(&tdr, sizeof (tdr),
2757 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2758 } else {
2759 struct T_unitdata_req tudr;
2760 void *addr;
2761 socklen_t addrlen;
2762 void *src;
2763 socklen_t srclen;
2764 struct T_opthdr toh2;
2765 t_scalar_t size;
2766
2767 /*
2768 * We know this is an AF_UNIX connected DGRAM socket.
2769 * We therefore already have the destination address
2770 * in the internal form needed for this send. This is
2771 * similar to the sosend_dgram call later in this file
2772 * when there's no user-specified destination address.
2773 */
2774 if (sti->sti_faddr_noxlate) {
2775 /*
2776 * Already have a transport internal address. Do not
2777 * pass any (transport internal) source address.
2778 */
2779 addr = sti->sti_faddr_sa;
2780 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2781 src = NULL;
2782 srclen = 0;
2783 } else {
2784 /*
2785 * Pass the sockaddr_un source address as an option
2786 * and translate the remote address.
2787 * Holding so_lock thus sti_laddr_sa can not change.
2788 */
2789 src = sti->sti_laddr_sa;
2790 srclen = (socklen_t)sti->sti_laddr_len;
2791 dprintso(so, 1,
2792 ("so_ux_close: srclen %d, src %p\n",
2793 srclen, src));
2794 /*
2795 * Use the destination address saved in connect.
2796 */
2797 addr = &sti->sti_ux_faddr;
2798 addrlen = sizeof (sti->sti_ux_faddr);
2799 }
2800 tudr.PRIM_type = T_UNITDATA_REQ;
2801 tudr.DEST_length = addrlen;
2802 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2803 if (srclen == 0) {
2804 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2805 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2806 _TPI_ALIGN_TOPT(addrlen));
2807
2808 size = tudr.OPT_offset + tudr.OPT_length;
2809 /* NOTE: holding so_lock while sleeping */
2810 mp = soallocproto2(&tudr, sizeof (tudr),
2811 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2812 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2813 soappendmsg(mp, &toh, sizeof (toh));
2814 } else {
2815 /*
2816 * There is a AF_UNIX sockaddr_un to include as a
2817 * source address option.
2818 */
2819 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2820 _TPI_ALIGN_TOPT(srclen));
2821 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2822 _TPI_ALIGN_TOPT(addrlen));
2823
2824 toh2.level = SOL_SOCKET;
2825 toh2.name = SO_SRCADDR;
2826 toh2.len = (t_uscalar_t)(srclen +
2827 sizeof (struct T_opthdr));
2828 toh2.status = 0;
2829
2830 size = tudr.OPT_offset + tudr.OPT_length;
2831
2832 /* NOTE: holding so_lock while sleeping */
2833 mp = soallocproto2(&tudr, sizeof (tudr),
2834 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2835 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2836 soappendmsg(mp, &toh, sizeof (toh));
2837 soappendmsg(mp, &toh2, sizeof (toh2));
2838 soappendmsg(mp, src, srclen);
2839 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2840 }
2841 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2842 }
2843 mutex_exit(&so->so_lock);
2844 (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2845 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2846 mutex_enter(&so->so_lock);
2847 }
2848
2849 /*
2850 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2851 * In addition, the caller typically verifies that there is some
2852 * potential state to clear by checking
2853 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2854 * before calling this routine.
2855 * Note that such a check can be made without holding so_lock since
2856 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2857 * decrements sti_oobsigcnt.
2858 *
2859 * When data is read *after* the point that all pending
2860 * oob data has been consumed the oob indication is cleared.
2861 *
2862 * This logic keeps select/poll returning POLLRDBAND and
2863 * SIOCATMARK returning true until we have read past
2864 * the mark.
2865 */
2866 static void
sorecv_update_oobstate(struct sonode * so)2867 sorecv_update_oobstate(struct sonode *so)
2868 {
2869 sotpi_info_t *sti = SOTOTPI(so);
2870
2871 mutex_enter(&so->so_lock);
2872 ASSERT(so_verify_oobstate(so));
2873 dprintso(so, 1,
2874 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2875 sti->sti_oobsigcnt,
2876 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2877 if (sti->sti_oobsigcnt == 0) {
2878 /* No more pending oob indications */
2879 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2880 freemsg(so->so_oobmsg);
2881 so->so_oobmsg = NULL;
2882 }
2883 ASSERT(so_verify_oobstate(so));
2884 mutex_exit(&so->so_lock);
2885 }
2886
2887 /*
2888 * Receive the next message on the queue.
2889 * If msg_controllen is non-zero when called the caller is interested in
2890 * any received control info (options).
2891 * If msg_namelen is non-zero when called the caller is interested in
2892 * any received source address.
2893 * The routine returns with msg_control and msg_name pointing to
2894 * kmem_alloc'ed memory which the caller has to free.
2895 */
2896 /* ARGSUSED */
2897 int
sotpi_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)2898 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
2899 struct cred *cr)
2900 {
2901 union T_primitives *tpr;
2902 mblk_t *mp;
2903 uchar_t pri;
2904 int pflag, opflag;
2905 void *control;
2906 t_uscalar_t controllen;
2907 t_uscalar_t namelen;
2908 int so_state = so->so_state; /* Snapshot */
2909 ssize_t saved_resid;
2910 rval_t rval;
2911 int flags;
2912 clock_t timout;
2913 int error = 0;
2914 sotpi_info_t *sti = SOTOTPI(so);
2915
2916 flags = msg->msg_flags;
2917 msg->msg_flags = 0;
2918
2919 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2920 (void *)so, (void *)msg, flags,
2921 pr_state(so->so_state, so->so_mode), so->so_error));
2922
2923 if (so->so_version == SOV_STREAM) {
2924 so_update_attrs(so, SOACC);
2925 /* The imaginary "sockmod" has been popped - act as a stream */
2926 return (strread(SOTOV(so), uiop, cr));
2927 }
2928
2929 /*
2930 * If we are not connected because we have never been connected
2931 * we return ENOTCONN. If we have been connected (but are no longer
2932 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2933 * the EOF.
2934 *
2935 * An alternative would be to post an ENOTCONN error in stream head
2936 * (read+write) and clear it when we're connected. However, that error
2937 * would cause incorrect poll/select behavior!
2938 */
2939 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2940 (so->so_mode & SM_CONNREQUIRED)) {
2941 return (ENOTCONN);
2942 }
2943
2944 /*
2945 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2946 * after checking that the read queue is empty) and returns zero.
2947 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2948 * is zero.
2949 */
2950
2951 if (flags & MSG_OOB) {
2952 /* Check that the transport supports OOB */
2953 if (!(so->so_mode & SM_EXDATA))
2954 return (EOPNOTSUPP);
2955 so_update_attrs(so, SOACC);
2956 return (sorecvoob(so, msg, uiop, flags,
2957 (so->so_options & SO_OOBINLINE)));
2958 }
2959
2960 so_update_attrs(so, SOACC);
2961
2962 /*
2963 * Set msg_controllen and msg_namelen to zero here to make it
2964 * simpler in the cases that no control or name is returned.
2965 */
2966 controllen = msg->msg_controllen;
2967 namelen = msg->msg_namelen;
2968 msg->msg_controllen = 0;
2969 msg->msg_namelen = 0;
2970
2971 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2972 namelen, controllen));
2973
2974 mutex_enter(&so->so_lock);
2975 /*
2976 * Only one reader is allowed at any given time. This is needed
2977 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2978 *
2979 * This is slightly different that BSD behavior in that it fails with
2980 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2981 * is single-threaded using sblock(), which is dropped while waiting
2982 * for data to appear. The difference shows up e.g. if one
2983 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
2984 * does use nonblocking io and different threads are reading each
2985 * file descriptor. In BSD there would never be an EWOULDBLOCK error
2986 * in this case as long as the read queue doesn't get empty.
2987 * In this implementation the thread using nonblocking io can
2988 * get an EWOULDBLOCK error due to the blocking thread executing
2989 * e.g. in the uiomove in kstrgetmsg.
2990 * This difference is not believed to be significant.
2991 */
2992 /* Set SOREADLOCKED */
2993 error = so_lock_read_intr(so,
2994 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
2995 mutex_exit(&so->so_lock);
2996 if (error)
2997 return (error);
2998
2999 /*
3000 * Tell kstrgetmsg to not inspect the stream head errors until all
3001 * queued data has been consumed.
3002 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3003 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3004 *
3005 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3006 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3007 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3008 */
3009 pflag = MSG_ANY | MSG_DELAYERROR;
3010 if (flags & MSG_PEEK) {
3011 pflag |= MSG_IPEEK;
3012 flags &= ~MSG_WAITALL;
3013 }
3014 if (so->so_mode & SM_ATOMIC)
3015 pflag |= MSG_DISCARDTAIL;
3016
3017 if (flags & MSG_DONTWAIT)
3018 timout = 0;
3019 else if (so->so_rcvtimeo != 0)
3020 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3021 else
3022 timout = -1;
3023 opflag = pflag;
3024 retry:
3025 saved_resid = uiop->uio_resid;
3026 pri = 0;
3027 mp = NULL;
3028 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3029 timout, &rval);
3030 if (error != 0) {
3031 /* kstrgetmsg returns ETIME when timeout expires */
3032 if (error == ETIME)
3033 error = EWOULDBLOCK;
3034 goto out;
3035 }
3036 /*
3037 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3038 * For non-datagrams MOREDATA is used to set MSG_EOR.
3039 */
3040 ASSERT(!(rval.r_val1 & MORECTL));
3041 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3042 msg->msg_flags |= MSG_TRUNC;
3043
3044 if (mp == NULL) {
3045 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3046 /*
3047 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3048 * The draft Posix socket spec states that the mark should
3049 * not be cleared when peeking. We follow the latter.
3050 */
3051 if ((so->so_state &
3052 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3053 (uiop->uio_resid != saved_resid) &&
3054 !(flags & MSG_PEEK)) {
3055 sorecv_update_oobstate(so);
3056 }
3057
3058 mutex_enter(&so->so_lock);
3059 /* Set MSG_EOR based on MOREDATA */
3060 if (!(rval.r_val1 & MOREDATA)) {
3061 if (so->so_state & SS_SAVEDEOR) {
3062 msg->msg_flags |= MSG_EOR;
3063 so->so_state &= ~SS_SAVEDEOR;
3064 }
3065 }
3066 /*
3067 * If some data was received (i.e. not EOF) and the
3068 * read/recv* has not been satisfied wait for some more.
3069 */
3070 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3071 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3072 mutex_exit(&so->so_lock);
3073 pflag = opflag | MSG_NOMARK;
3074 goto retry;
3075 }
3076 goto out_locked;
3077 }
3078
3079 /* strsock_proto has already verified length and alignment */
3080 tpr = (union T_primitives *)mp->b_rptr;
3081 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3082
3083 switch (tpr->type) {
3084 case T_DATA_IND: {
3085 if ((so->so_state &
3086 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3087 (uiop->uio_resid != saved_resid) &&
3088 !(flags & MSG_PEEK)) {
3089 sorecv_update_oobstate(so);
3090 }
3091
3092 /*
3093 * Set msg_flags to MSG_EOR based on
3094 * MORE_flag and MOREDATA.
3095 */
3096 mutex_enter(&so->so_lock);
3097 so->so_state &= ~SS_SAVEDEOR;
3098 if (!(tpr->data_ind.MORE_flag & 1)) {
3099 if (!(rval.r_val1 & MOREDATA))
3100 msg->msg_flags |= MSG_EOR;
3101 else
3102 so->so_state |= SS_SAVEDEOR;
3103 }
3104 freemsg(mp);
3105 /*
3106 * If some data was received (i.e. not EOF) and the
3107 * read/recv* has not been satisfied wait for some more.
3108 */
3109 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3110 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3111 mutex_exit(&so->so_lock);
3112 pflag = opflag | MSG_NOMARK;
3113 goto retry;
3114 }
3115 goto out_locked;
3116 }
3117 case T_UNITDATA_IND: {
3118 void *addr;
3119 t_uscalar_t addrlen;
3120 void *abuf;
3121 t_uscalar_t optlen;
3122 void *opt;
3123
3124 if ((so->so_state &
3125 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3126 (uiop->uio_resid != saved_resid) &&
3127 !(flags & MSG_PEEK)) {
3128 sorecv_update_oobstate(so);
3129 }
3130
3131 if (namelen != 0) {
3132 /* Caller wants source address */
3133 addrlen = tpr->unitdata_ind.SRC_length;
3134 addr = sogetoff(mp,
3135 tpr->unitdata_ind.SRC_offset,
3136 addrlen, 1);
3137 if (addr == NULL) {
3138 freemsg(mp);
3139 error = EPROTO;
3140 eprintsoline(so, error);
3141 goto out;
3142 }
3143 if (so->so_family == AF_UNIX) {
3144 /*
3145 * Can not use the transport level address.
3146 * If there is a SO_SRCADDR option carrying
3147 * the socket level address it will be
3148 * extracted below.
3149 */
3150 addr = NULL;
3151 addrlen = 0;
3152 }
3153 }
3154 optlen = tpr->unitdata_ind.OPT_length;
3155 if (optlen != 0) {
3156 t_uscalar_t ncontrollen;
3157
3158 /*
3159 * Extract any source address option.
3160 * Determine how large cmsg buffer is needed.
3161 */
3162 opt = sogetoff(mp,
3163 tpr->unitdata_ind.OPT_offset,
3164 optlen, __TPI_ALIGN_SIZE);
3165
3166 if (opt == NULL) {
3167 freemsg(mp);
3168 error = EPROTO;
3169 eprintsoline(so, error);
3170 goto out;
3171 }
3172 if (so->so_family == AF_UNIX)
3173 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3174 ncontrollen = so_cmsglen(mp, opt, optlen,
3175 !(flags & MSG_XPG4_2));
3176 if (controllen != 0)
3177 controllen = ncontrollen;
3178 else if (ncontrollen != 0)
3179 msg->msg_flags |= MSG_CTRUNC;
3180 } else {
3181 controllen = 0;
3182 }
3183
3184 if (namelen != 0) {
3185 /*
3186 * Return address to caller.
3187 * Caller handles truncation if length
3188 * exceeds msg_namelen.
3189 * NOTE: AF_UNIX NUL termination is ensured by
3190 * the sender's copyin_name().
3191 */
3192 abuf = kmem_alloc(addrlen, KM_SLEEP);
3193
3194 bcopy(addr, abuf, addrlen);
3195 msg->msg_name = abuf;
3196 msg->msg_namelen = addrlen;
3197 }
3198
3199 if (controllen != 0) {
3200 /*
3201 * Return control msg to caller.
3202 * Caller handles truncation if length
3203 * exceeds msg_controllen.
3204 */
3205 control = kmem_zalloc(controllen, KM_SLEEP);
3206
3207 error = so_opt2cmsg(mp, opt, optlen, flags, control,
3208 controllen);
3209 if (error) {
3210 freemsg(mp);
3211 if (msg->msg_namelen != 0)
3212 kmem_free(msg->msg_name,
3213 msg->msg_namelen);
3214 kmem_free(control, controllen);
3215 eprintsoline(so, error);
3216 goto out;
3217 }
3218 msg->msg_control = control;
3219 msg->msg_controllen = controllen;
3220 }
3221
3222 freemsg(mp);
3223 goto out;
3224 }
3225 case T_OPTDATA_IND: {
3226 struct T_optdata_req *tdr;
3227 void *opt;
3228 t_uscalar_t optlen;
3229
3230 if ((so->so_state &
3231 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3232 (uiop->uio_resid != saved_resid) &&
3233 !(flags & MSG_PEEK)) {
3234 sorecv_update_oobstate(so);
3235 }
3236
3237 tdr = (struct T_optdata_req *)mp->b_rptr;
3238 optlen = tdr->OPT_length;
3239 if (optlen != 0) {
3240 t_uscalar_t ncontrollen;
3241 /*
3242 * Determine how large cmsg buffer is needed.
3243 */
3244 opt = sogetoff(mp,
3245 tpr->optdata_ind.OPT_offset,
3246 optlen, __TPI_ALIGN_SIZE);
3247
3248 if (opt == NULL) {
3249 freemsg(mp);
3250 error = EPROTO;
3251 eprintsoline(so, error);
3252 goto out;
3253 }
3254
3255 ncontrollen = so_cmsglen(mp, opt, optlen,
3256 !(flags & MSG_XPG4_2));
3257 if (controllen != 0)
3258 controllen = ncontrollen;
3259 else if (ncontrollen != 0)
3260 msg->msg_flags |= MSG_CTRUNC;
3261 } else {
3262 controllen = 0;
3263 }
3264
3265 if (controllen != 0) {
3266 /*
3267 * Return control msg to caller.
3268 * Caller handles truncation if length
3269 * exceeds msg_controllen.
3270 */
3271 control = kmem_zalloc(controllen, KM_SLEEP);
3272
3273 error = so_opt2cmsg(mp, opt, optlen, flags, control,
3274 controllen);
3275 if (error) {
3276 freemsg(mp);
3277 kmem_free(control, controllen);
3278 eprintsoline(so, error);
3279 goto out;
3280 }
3281 msg->msg_control = control;
3282 msg->msg_controllen = controllen;
3283 }
3284
3285 /*
3286 * Set msg_flags to MSG_EOR based on
3287 * DATA_flag and MOREDATA.
3288 */
3289 mutex_enter(&so->so_lock);
3290 so->so_state &= ~SS_SAVEDEOR;
3291 if (!(tpr->data_ind.MORE_flag & 1)) {
3292 if (!(rval.r_val1 & MOREDATA))
3293 msg->msg_flags |= MSG_EOR;
3294 else
3295 so->so_state |= SS_SAVEDEOR;
3296 }
3297 freemsg(mp);
3298 /*
3299 * If some data was received (i.e. not EOF) and the
3300 * read/recv* has not been satisfied wait for some more.
3301 * Not possible to wait if control info was received.
3302 */
3303 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3304 controllen == 0 &&
3305 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3306 mutex_exit(&so->so_lock);
3307 pflag = opflag | MSG_NOMARK;
3308 goto retry;
3309 }
3310 goto out_locked;
3311 }
3312 case T_EXDATA_IND: {
3313 dprintso(so, 1,
3314 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3315 "state %s\n",
3316 sti->sti_oobsigcnt, sti->sti_oobcnt,
3317 saved_resid - uiop->uio_resid,
3318 pr_state(so->so_state, so->so_mode)));
3319 /*
3320 * kstrgetmsg handles MSGMARK so there is nothing to
3321 * inspect in the T_EXDATA_IND.
3322 * strsock_proto makes the stream head queue the T_EXDATA_IND
3323 * as a separate message with no M_DATA component. Furthermore,
3324 * the stream head does not consolidate M_DATA messages onto
3325 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3326 * remains a message by itself. This is needed since MSGMARK
3327 * marks both the whole message as well as the last byte
3328 * of the message.
3329 */
3330 freemsg(mp);
3331 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3332 if (flags & MSG_PEEK) {
3333 /*
3334 * Even though we are peeking we consume the
3335 * T_EXDATA_IND thereby moving the mark information
3336 * to SS_RCVATMARK. Then the oob code below will
3337 * retry the peeking kstrgetmsg.
3338 * Note that the stream head read queue is
3339 * never flushed without holding SOREADLOCKED
3340 * thus the T_EXDATA_IND can not disappear
3341 * underneath us.
3342 */
3343 dprintso(so, 1,
3344 ("sotpi_recvmsg: consume EXDATA_IND "
3345 "counts %d/%d state %s\n",
3346 sti->sti_oobsigcnt,
3347 sti->sti_oobcnt,
3348 pr_state(so->so_state, so->so_mode)));
3349
3350 pflag = MSG_ANY | MSG_DELAYERROR;
3351 if (so->so_mode & SM_ATOMIC)
3352 pflag |= MSG_DISCARDTAIL;
3353
3354 pri = 0;
3355 mp = NULL;
3356
3357 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3358 &pri, &pflag, (clock_t)-1, &rval);
3359 ASSERT(uiop->uio_resid == saved_resid);
3360
3361 if (error) {
3362 #ifdef SOCK_DEBUG
3363 if (error != EWOULDBLOCK && error != EINTR) {
3364 eprintsoline(so, error);
3365 }
3366 #endif /* SOCK_DEBUG */
3367 goto out;
3368 }
3369 ASSERT(mp);
3370 tpr = (union T_primitives *)mp->b_rptr;
3371 ASSERT(tpr->type == T_EXDATA_IND);
3372 freemsg(mp);
3373 } /* end "if (flags & MSG_PEEK)" */
3374
3375 /*
3376 * Decrement the number of queued and pending oob.
3377 *
3378 * SS_RCVATMARK is cleared when we read past a mark.
3379 * SS_HAVEOOBDATA is cleared when we've read past the
3380 * last mark.
3381 * SS_OOBPEND is cleared if we've read past the last
3382 * mark and no (new) SIGURG has been posted.
3383 */
3384 mutex_enter(&so->so_lock);
3385 ASSERT(so_verify_oobstate(so));
3386 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3387 ASSERT(sti->sti_oobsigcnt > 0);
3388 sti->sti_oobsigcnt--;
3389 ASSERT(sti->sti_oobcnt > 0);
3390 sti->sti_oobcnt--;
3391 /*
3392 * Since the T_EXDATA_IND has been removed from the stream
3393 * head, but we have not read data past the mark,
3394 * sockfs needs to track that the socket is still at the mark.
3395 *
3396 * Since no data was received call kstrgetmsg again to wait
3397 * for data.
3398 */
3399 so->so_state |= SS_RCVATMARK;
3400 mutex_exit(&so->so_lock);
3401 dprintso(so, 1,
3402 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3403 sti->sti_oobsigcnt, sti->sti_oobcnt,
3404 pr_state(so->so_state, so->so_mode)));
3405 pflag = opflag;
3406 goto retry;
3407 }
3408 default:
3409 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3410 (void *)so, tpr->type, (void *)mp);
3411 ASSERT(0);
3412 freemsg(mp);
3413 error = EPROTO;
3414 eprintsoline(so, error);
3415 goto out;
3416 }
3417 /* NOTREACHED */
3418 out:
3419 mutex_enter(&so->so_lock);
3420 out_locked:
3421 so_unlock_read(so); /* Clear SOREADLOCKED */
3422 mutex_exit(&so->so_lock);
3423 return (error);
3424 }
3425
3426 /*
3427 * Sending data with options on a datagram socket.
3428 * Assumes caller has verified that SS_ISBOUND etc. are set.
3429 *
3430 * For AF_UNIX the destination address may be already in
3431 * internal form, as indicated by sti->sti_faddr_noxlate
3432 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3433 * translate the destination address to internal form.
3434 *
3435 * The source address is passed as an option. If passing
3436 * file descriptors, those are passed as file pointers in
3437 * another option.
3438 */
3439 static int
sosend_dgramcmsg(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,void * control,t_uscalar_t controllen,int flags)3440 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3441 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3442 {
3443 struct T_unitdata_req tudr;
3444 mblk_t *mp;
3445 int error;
3446 void *addr;
3447 socklen_t addrlen;
3448 void *src;
3449 socklen_t srclen;
3450 ssize_t len;
3451 int size;
3452 struct T_opthdr toh;
3453 struct fdbuf *fdbuf;
3454 t_uscalar_t optlen;
3455 void *fds;
3456 int fdlen;
3457 sotpi_info_t *sti = SOTOTPI(so);
3458
3459 ASSERT(name && namelen);
3460 ASSERT(control && controllen);
3461
3462 len = uiop->uio_resid;
3463 if (len > (ssize_t)sti->sti_tidu_size) {
3464 return (EMSGSIZE);
3465 }
3466
3467 if (sti->sti_faddr_noxlate == 0 &&
3468 (flags & MSG_SENDTO_NOXLATE) == 0) {
3469 /*
3470 * Length and family checks.
3471 * Don't verify internal form.
3472 */
3473 error = so_addr_verify(so, name, namelen);
3474 if (error) {
3475 eprintsoline(so, error);
3476 return (error);
3477 }
3478 }
3479
3480 if (so->so_family == AF_UNIX) {
3481 if (sti->sti_faddr_noxlate) {
3482 /*
3483 * Already have a transport internal address. Do not
3484 * pass any (transport internal) source address.
3485 */
3486 addr = name;
3487 addrlen = namelen;
3488 src = NULL;
3489 srclen = 0;
3490 } else if (flags & MSG_SENDTO_NOXLATE) {
3491 /*
3492 * Have an internal form dest. address.
3493 * Pass the source address as usual.
3494 */
3495 addr = name;
3496 addrlen = namelen;
3497 src = sti->sti_laddr_sa;
3498 srclen = (socklen_t)sti->sti_laddr_len;
3499 } else {
3500 /*
3501 * Pass the sockaddr_un source address as an option
3502 * and translate the remote address.
3503 *
3504 * Note that this code does not prevent sti_laddr_sa
3505 * from changing while it is being used. Thus
3506 * if an unbind+bind occurs concurrently with this
3507 * send the peer might see a partially new and a
3508 * partially old "from" address.
3509 */
3510 src = sti->sti_laddr_sa;
3511 srclen = (socklen_t)sti->sti_laddr_len;
3512 dprintso(so, 1,
3513 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3514 srclen, src));
3515 /*
3516 * The sendmsg caller specified a destination
3517 * address, which we must translate into our
3518 * internal form. addr = &sti->sti_ux_taddr
3519 */
3520 error = so_ux_addr_xlate(so, name, namelen,
3521 (flags & MSG_XPG4_2),
3522 &addr, &addrlen);
3523 if (error) {
3524 eprintsoline(so, error);
3525 return (error);
3526 }
3527 }
3528 } else {
3529 addr = name;
3530 addrlen = namelen;
3531 src = NULL;
3532 srclen = 0;
3533 }
3534 optlen = so_optlen(control, controllen,
3535 !(flags & MSG_XPG4_2));
3536 tudr.PRIM_type = T_UNITDATA_REQ;
3537 tudr.DEST_length = addrlen;
3538 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3539 if (srclen != 0)
3540 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3541 _TPI_ALIGN_TOPT(srclen));
3542 else
3543 tudr.OPT_length = optlen;
3544 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3545 _TPI_ALIGN_TOPT(addrlen));
3546
3547 size = tudr.OPT_offset + tudr.OPT_length;
3548
3549 /*
3550 * File descriptors only when SM_FDPASSING set.
3551 */
3552 error = so_getfdopt(control, controllen,
3553 !(flags & MSG_XPG4_2), &fds, &fdlen);
3554 if (error)
3555 return (error);
3556 if (fdlen != -1) {
3557 if (!(so->so_mode & SM_FDPASSING))
3558 return (EOPNOTSUPP);
3559
3560 error = fdbuf_create(fds, fdlen, &fdbuf);
3561 if (error)
3562 return (error);
3563
3564 /*
3565 * Pre-allocate enough additional space for lower level modules
3566 * to append an option (e.g. see tl_unitdata). The following
3567 * is enough extra space for the largest option we might append.
3568 */
3569 size += sizeof (struct T_opthdr) + ucredsize;
3570 mp = fdbuf_allocmsg(size, fdbuf);
3571 } else {
3572 mp = soallocproto(size, _ALLOC_INTR, CRED());
3573 if (mp == NULL) {
3574 /*
3575 * Caught a signal waiting for memory.
3576 * Let send* return EINTR.
3577 */
3578 return (EINTR);
3579 }
3580 }
3581 soappendmsg(mp, &tudr, sizeof (tudr));
3582 soappendmsg(mp, addr, addrlen);
3583 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3584
3585 if (fdlen != -1) {
3586 ASSERT(fdbuf != NULL);
3587 toh.level = SOL_SOCKET;
3588 toh.name = SO_FILEP;
3589 toh.len = fdbuf->fd_size +
3590 (t_uscalar_t)sizeof (struct T_opthdr);
3591 toh.status = 0;
3592 soappendmsg(mp, &toh, sizeof (toh));
3593 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3594 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3595 }
3596 if (srclen != 0) {
3597 /*
3598 * There is a AF_UNIX sockaddr_un to include as a source
3599 * address option.
3600 */
3601 toh.level = SOL_SOCKET;
3602 toh.name = SO_SRCADDR;
3603 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3604 toh.status = 0;
3605 soappendmsg(mp, &toh, sizeof (toh));
3606 soappendmsg(mp, src, srclen);
3607 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3608 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3609 }
3610 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3611 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3612 /*
3613 * Normally at most 3 bytes left in the message, but we might have
3614 * allowed for extra space if we're passing fd's through.
3615 */
3616 ASSERT(MBLKL(mp) <= (ssize_t)size);
3617
3618 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3619 if (AU_AUDITING())
3620 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3621
3622 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3623 #ifdef SOCK_DEBUG
3624 if (error) {
3625 eprintsoline(so, error);
3626 }
3627 #endif /* SOCK_DEBUG */
3628 return (error);
3629 }
3630
3631 /*
3632 * Sending data with options on a connected stream socket.
3633 * Assumes caller has verified that SS_ISCONNECTED is set.
3634 */
3635 static int
sosend_svccmsg(struct sonode * so,struct uio * uiop,int more,void * control,t_uscalar_t controllen,int flags)3636 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3637 t_uscalar_t controllen, int flags)
3638 {
3639 struct T_optdata_req tdr;
3640 mblk_t *mp;
3641 int error;
3642 ssize_t iosize;
3643 int size;
3644 struct fdbuf *fdbuf;
3645 t_uscalar_t optlen;
3646 void *fds;
3647 int fdlen;
3648 struct T_opthdr toh;
3649 sotpi_info_t *sti = SOTOTPI(so);
3650
3651 dprintso(so, 1,
3652 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3653
3654 /*
3655 * Has to be bound and connected. However, since no locks are
3656 * held the state could have changed after sotpi_sendmsg checked it
3657 * thus it is not possible to ASSERT on the state.
3658 */
3659
3660 /* Options on connection-oriented only when SM_OPTDATA set. */
3661 if (!(so->so_mode & SM_OPTDATA))
3662 return (EOPNOTSUPP);
3663
3664 do {
3665 /*
3666 * Set the MORE flag if uio_resid does not fit in this
3667 * message or if the caller passed in "more".
3668 * Error for transports with zero tidu_size.
3669 */
3670 tdr.PRIM_type = T_OPTDATA_REQ;
3671 iosize = sti->sti_tidu_size;
3672 if (iosize <= 0)
3673 return (EMSGSIZE);
3674 if (uiop->uio_resid > iosize) {
3675 tdr.DATA_flag = 1;
3676 } else {
3677 if (more)
3678 tdr.DATA_flag = 1;
3679 else
3680 tdr.DATA_flag = 0;
3681 iosize = uiop->uio_resid;
3682 }
3683 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3684 tdr.DATA_flag, iosize));
3685
3686 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3687 tdr.OPT_length = optlen;
3688 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3689
3690 size = (int)sizeof (tdr) + optlen;
3691 /*
3692 * File descriptors only when SM_FDPASSING set.
3693 */
3694 error = so_getfdopt(control, controllen,
3695 !(flags & MSG_XPG4_2), &fds, &fdlen);
3696 if (error)
3697 return (error);
3698 if (fdlen != -1) {
3699 if (!(so->so_mode & SM_FDPASSING))
3700 return (EOPNOTSUPP);
3701
3702 error = fdbuf_create(fds, fdlen, &fdbuf);
3703 if (error)
3704 return (error);
3705
3706 /*
3707 * Pre-allocate enough additional space for lower level
3708 * modules to append an option (e.g. see tl_unitdata).
3709 * The following is enough extra space for the largest
3710 * option we might append.
3711 */
3712 size += sizeof (struct T_opthdr) + ucredsize;
3713 mp = fdbuf_allocmsg(size, fdbuf);
3714 } else {
3715 mp = soallocproto(size, _ALLOC_INTR, CRED());
3716 if (mp == NULL) {
3717 /*
3718 * Caught a signal waiting for memory.
3719 * Let send* return EINTR.
3720 */
3721 return (EINTR);
3722 }
3723 }
3724 soappendmsg(mp, &tdr, sizeof (tdr));
3725
3726 if (fdlen != -1) {
3727 ASSERT(fdbuf != NULL);
3728 toh.level = SOL_SOCKET;
3729 toh.name = SO_FILEP;
3730 toh.len = fdbuf->fd_size +
3731 (t_uscalar_t)sizeof (struct T_opthdr);
3732 toh.status = 0;
3733 soappendmsg(mp, &toh, sizeof (toh));
3734 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3735 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3736 }
3737 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3738 /*
3739 * Normally at most 3 bytes left in the message, but we might
3740 * have allowed for extra space if we're passing fd's through.
3741 */
3742 ASSERT(MBLKL(mp) <= (ssize_t)size);
3743
3744 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3745
3746 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3747 0, MSG_BAND, 0);
3748 if (error) {
3749 eprintsoline(so, error);
3750 return (error);
3751 }
3752 control = NULL;
3753 if (uiop->uio_resid > 0) {
3754 /*
3755 * Recheck for fatal errors. Fail write even though
3756 * some data have been written. This is consistent
3757 * with strwrite semantics and BSD sockets semantics.
3758 */
3759 if (so->so_state & SS_CANTSENDMORE) {
3760 eprintsoline(so, error);
3761 return (EPIPE);
3762 }
3763 if (so->so_error != 0) {
3764 mutex_enter(&so->so_lock);
3765 error = sogeterr(so, B_TRUE);
3766 mutex_exit(&so->so_lock);
3767 if (error != 0) {
3768 eprintsoline(so, error);
3769 return (error);
3770 }
3771 }
3772 }
3773 } while (uiop->uio_resid > 0);
3774 return (0);
3775 }
3776
3777 /*
3778 * Sending data on a datagram socket.
3779 * Assumes caller has verified that SS_ISBOUND etc. are set.
3780 *
3781 * For AF_UNIX the destination address may be already in
3782 * internal form, as indicated by sti->sti_faddr_noxlate
3783 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3784 * translate the destination address to internal form.
3785 *
3786 * The source address is passed as an option.
3787 */
3788 int
sosend_dgram(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)3789 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3790 struct uio *uiop, int flags)
3791 {
3792 struct T_unitdata_req tudr;
3793 mblk_t *mp;
3794 int error;
3795 void *addr;
3796 socklen_t addrlen;
3797 void *src;
3798 socklen_t srclen;
3799 ssize_t len;
3800 sotpi_info_t *sti = SOTOTPI(so);
3801
3802 ASSERT(name != NULL && namelen != 0);
3803
3804 len = uiop->uio_resid;
3805 if (len > sti->sti_tidu_size) {
3806 error = EMSGSIZE;
3807 goto done;
3808 }
3809
3810 if (sti->sti_faddr_noxlate == 0 &&
3811 (flags & MSG_SENDTO_NOXLATE) == 0) {
3812 /*
3813 * Length and family checks.
3814 * Don't verify internal form.
3815 */
3816 error = so_addr_verify(so, name, namelen);
3817 if (error != 0)
3818 goto done;
3819 }
3820
3821 if (sti->sti_direct) /* Never on AF_UNIX */
3822 return (sodgram_direct(so, name, namelen, uiop, flags));
3823
3824 if (so->so_family == AF_UNIX) {
3825 if (sti->sti_faddr_noxlate) {
3826 /*
3827 * Already have a transport internal address. Do not
3828 * pass any (transport internal) source address.
3829 */
3830 addr = name;
3831 addrlen = namelen;
3832 src = NULL;
3833 srclen = 0;
3834 } else if (flags & MSG_SENDTO_NOXLATE) {
3835 /*
3836 * Have an internal form dest. address.
3837 * Pass the source address as usual.
3838 */
3839 addr = name;
3840 addrlen = namelen;
3841 src = sti->sti_laddr_sa;
3842 srclen = (socklen_t)sti->sti_laddr_len;
3843 } else {
3844 /*
3845 * Pass the sockaddr_un source address as an option
3846 * and translate the remote address.
3847 *
3848 * Note that this code does not prevent sti_laddr_sa
3849 * from changing while it is being used. Thus
3850 * if an unbind+bind occurs concurrently with this
3851 * send the peer might see a partially new and a
3852 * partially old "from" address.
3853 */
3854 src = sti->sti_laddr_sa;
3855 srclen = (socklen_t)sti->sti_laddr_len;
3856 dprintso(so, 1,
3857 ("sosend_dgram UNIX: srclen %d, src %p\n",
3858 srclen, src));
3859 /*
3860 * The sendmsg caller specified a destination
3861 * address, which we must translate into our
3862 * internal form. addr = &sti->sti_ux_taddr
3863 */
3864 error = so_ux_addr_xlate(so, name, namelen,
3865 (flags & MSG_XPG4_2),
3866 &addr, &addrlen);
3867 if (error) {
3868 eprintsoline(so, error);
3869 goto done;
3870 }
3871 }
3872 } else {
3873 addr = name;
3874 addrlen = namelen;
3875 src = NULL;
3876 srclen = 0;
3877 }
3878 tudr.PRIM_type = T_UNITDATA_REQ;
3879 tudr.DEST_length = addrlen;
3880 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3881 if (srclen == 0) {
3882 tudr.OPT_length = 0;
3883 tudr.OPT_offset = 0;
3884
3885 mp = soallocproto2(&tudr, sizeof (tudr),
3886 addr, addrlen, 0, _ALLOC_INTR, CRED());
3887 if (mp == NULL) {
3888 /*
3889 * Caught a signal waiting for memory.
3890 * Let send* return EINTR.
3891 */
3892 error = EINTR;
3893 goto done;
3894 }
3895 } else {
3896 /*
3897 * There is a AF_UNIX sockaddr_un to include as a source
3898 * address option.
3899 */
3900 struct T_opthdr toh;
3901 ssize_t size;
3902
3903 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3904 _TPI_ALIGN_TOPT(srclen));
3905 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3906 _TPI_ALIGN_TOPT(addrlen));
3907
3908 toh.level = SOL_SOCKET;
3909 toh.name = SO_SRCADDR;
3910 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3911 toh.status = 0;
3912
3913 size = tudr.OPT_offset + tudr.OPT_length;
3914 mp = soallocproto2(&tudr, sizeof (tudr),
3915 addr, addrlen, size, _ALLOC_INTR, CRED());
3916 if (mp == NULL) {
3917 /*
3918 * Caught a signal waiting for memory.
3919 * Let send* return EINTR.
3920 */
3921 error = EINTR;
3922 goto done;
3923 }
3924 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3925 soappendmsg(mp, &toh, sizeof (toh));
3926 soappendmsg(mp, src, srclen);
3927 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3928 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3929 }
3930
3931 if (AU_AUDITING())
3932 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3933
3934 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3935 done:
3936 #ifdef SOCK_DEBUG
3937 if (error) {
3938 eprintsoline(so, error);
3939 }
3940 #endif /* SOCK_DEBUG */
3941 return (error);
3942 }
3943
3944 /*
3945 * Sending data on a connected stream socket.
3946 * Assumes caller has verified that SS_ISCONNECTED is set.
3947 */
3948 int
sosend_svc(struct sonode * so,struct uio * uiop,t_scalar_t prim,int more,int sflag)3949 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
3950 int sflag)
3951 {
3952 struct T_data_req tdr;
3953 mblk_t *mp;
3954 int error;
3955 ssize_t iosize;
3956 sotpi_info_t *sti = SOTOTPI(so);
3957
3958 dprintso(so, 1,
3959 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3960 (void *)so, uiop->uio_resid, prim, sflag));
3961
3962 /*
3963 * Has to be bound and connected. However, since no locks are
3964 * held the state could have changed after sotpi_sendmsg checked it
3965 * thus it is not possible to ASSERT on the state.
3966 */
3967
3968 do {
3969 /*
3970 * Set the MORE flag if uio_resid does not fit in this
3971 * message or if the caller passed in "more".
3972 * Error for transports with zero tidu_size.
3973 */
3974 tdr.PRIM_type = prim;
3975 iosize = sti->sti_tidu_size;
3976 if (iosize <= 0)
3977 return (EMSGSIZE);
3978 if (uiop->uio_resid > iosize) {
3979 tdr.MORE_flag = 1;
3980 } else {
3981 if (more)
3982 tdr.MORE_flag = 1;
3983 else
3984 tdr.MORE_flag = 0;
3985 iosize = uiop->uio_resid;
3986 }
3987 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
3988 prim, tdr.MORE_flag, iosize));
3989 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
3990 if (mp == NULL) {
3991 /*
3992 * Caught a signal waiting for memory.
3993 * Let send* return EINTR.
3994 */
3995 return (EINTR);
3996 }
3997
3998 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3999 0, sflag | MSG_BAND, 0);
4000 if (error) {
4001 eprintsoline(so, error);
4002 return (error);
4003 }
4004 if (uiop->uio_resid > 0) {
4005 /*
4006 * Recheck for fatal errors. Fail write even though
4007 * some data have been written. This is consistent
4008 * with strwrite semantics and BSD sockets semantics.
4009 */
4010 if (so->so_state & SS_CANTSENDMORE) {
4011 eprintsoline(so, error);
4012 return (EPIPE);
4013 }
4014 if (so->so_error != 0) {
4015 mutex_enter(&so->so_lock);
4016 error = sogeterr(so, B_TRUE);
4017 mutex_exit(&so->so_lock);
4018 if (error != 0) {
4019 eprintsoline(so, error);
4020 return (error);
4021 }
4022 }
4023 }
4024 } while (uiop->uio_resid > 0);
4025 return (0);
4026 }
4027
4028 /*
4029 * Check the state for errors and call the appropriate send function.
4030 *
4031 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4032 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4033 * after sending the message.
4034 *
4035 * The caller may optionally specify a destination address, for either
4036 * stream or datagram sockets. This table summarizes the cases:
4037 *
4038 * Socket type Dest. given Connected Result
4039 * ----------- ----------- --------- --------------
4040 * Stream * Yes send to conn. addr.
4041 * Stream * No error ENOTCONN
4042 * Dgram yes * send to given addr.
4043 * Dgram no yes send to conn. addr.
4044 * Dgram no no error EDESTADDRREQ
4045 *
4046 * There are subtleties around the destination address when using
4047 * AF_UNIX datagram sockets. When the sendmsg call specifies the
4048 * destination address, it's in (struct sockaddr_un) form and we
4049 * need to translate it to our internal form (struct so_ux_addr).
4050 *
4051 * When the sendmsg call does not specify a destination address
4052 * we're using the peer address saved during sotpi_connect, and
4053 * that address is already in internal form. In this case, the
4054 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4055 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4056 * those functions should skip translation to internal form.
4057 * Avoiding that translation is not only more efficient, but it's
4058 * also necessary when a process does a connect on an AF_UNIX
4059 * datagram socket and then drops privileges. After the process
 * has dropped privileges, it may no longer be able to look up the
 * external name in the filesystem, but it should still be
4062 * able to send messages on the connected socket by leaving the
4063 * destination name unspecified.
4064 *
4065 * Yet more subtleties arise with sockets connected by socketpair(),
4066 * which puts internal form addresses in the fields where normally
4067 * the external form is found, and sets sti_faddr_noxlate=1, which
4068 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4069 * to skip translation of destination addresses to internal form.
4070 * However, beware that the flag sti_faddr_noxlate=1 also triggers
4071 * different behaviour almost everywhere AF_UNIX addresses appear.
4072 */
4073 static int
sotpi_sendmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)4074 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4075 struct cred *cr)
4076 {
4077 int so_state;
4078 int so_mode;
4079 int error;
4080 struct sockaddr *name;
4081 t_uscalar_t namelen;
4082 int dontroute;
4083 int flags;
4084 sotpi_info_t *sti = SOTOTPI(so);
4085
4086 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4087 (void *)so, (void *)msg, msg->msg_flags,
4088 pr_state(so->so_state, so->so_mode), so->so_error));
4089
4090 if (so->so_version == SOV_STREAM) {
4091 /* The imaginary "sockmod" has been popped - act as a stream */
4092 so_update_attrs(so, SOMOD);
4093 return (strwrite(SOTOV(so), uiop, cr));
4094 }
4095
4096 mutex_enter(&so->so_lock);
4097 so_state = so->so_state;
4098
4099 if (so_state & SS_CANTSENDMORE) {
4100 mutex_exit(&so->so_lock);
4101 return (EPIPE);
4102 }
4103
4104 if (so->so_error != 0) {
4105 error = sogeterr(so, B_TRUE);
4106 if (error != 0) {
4107 mutex_exit(&so->so_lock);
4108 return (error);
4109 }
4110 }
4111
4112 name = (struct sockaddr *)msg->msg_name;
4113 namelen = msg->msg_namelen;
4114 flags = msg->msg_flags;
4115
4116 /*
4117 * Historically, this function does not validate the flags
4118 * passed in, and any errant bits are ignored. However,
4119 * we would not want any such errant flag bits accidently
4120 * being treated as one of the internal-only flags, so
4121 * clear the internal-only flag bits.
4122 */
4123 flags &= ~MSG_SENDTO_NOXLATE;
4124
4125 so_mode = so->so_mode;
4126
4127 if (name == NULL) {
4128 if (!(so_state & SS_ISCONNECTED)) {
4129 mutex_exit(&so->so_lock);
4130 if (so_mode & SM_CONNREQUIRED)
4131 return (ENOTCONN);
4132 else
4133 return (EDESTADDRREQ);
4134 }
4135 /*
4136 * This is a connected socket.
4137 */
4138 if (so_mode & SM_CONNREQUIRED) {
4139 /*
4140 * This is a connected STREAM socket,
4141 * destination not specified.
4142 */
4143 name = NULL;
4144 namelen = 0;
4145 } else {
4146 /*
4147 * Datagram send on connected socket with
4148 * the destination name not specified.
4149 * Use the peer address from connect.
4150 */
4151 if (so->so_family == AF_UNIX) {
4152 /*
4153 * Use the (internal form) address saved
4154 * in sotpi_connect. See above.
4155 */
4156 name = (void *)&sti->sti_ux_faddr;
4157 namelen = sizeof (sti->sti_ux_faddr);
4158 flags |= MSG_SENDTO_NOXLATE;
4159 } else {
4160 ASSERT(sti->sti_faddr_sa);
4161 name = sti->sti_faddr_sa;
4162 namelen = (t_uscalar_t)sti->sti_faddr_len;
4163 }
4164 }
4165 } else {
4166 /*
4167 * Sendmsg specifies a destination name
4168 */
4169 if (!(so_state & SS_ISCONNECTED) &&
4170 (so_mode & SM_CONNREQUIRED)) {
4171 /* i.e. TCP not connected */
4172 mutex_exit(&so->so_lock);
4173 return (ENOTCONN);
4174 }
4175 /*
4176 * Ignore the address on connection-oriented sockets.
4177 * Just like BSD this code does not generate an error for
4178 * TCP (a CONNREQUIRED socket) when sending to an address
4179 * passed in with sendto/sendmsg. Instead the data is
4180 * delivered on the connection as if no address had been
4181 * supplied.
4182 */
4183 if ((so_state & SS_ISCONNECTED) &&
4184 !(so_mode & SM_CONNREQUIRED)) {
4185 mutex_exit(&so->so_lock);
4186 return (EISCONN);
4187 }
4188 if (!(so_state & SS_ISBOUND)) {
4189 so_lock_single(so); /* Set SOLOCKED */
4190 error = sotpi_bind(so, NULL, 0,
4191 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4192 so_unlock_single(so, SOLOCKED);
4193 if (error) {
4194 mutex_exit(&so->so_lock);
4195 eprintsoline(so, error);
4196 return (error);
4197 }
4198 }
4199 /*
4200 * Handle delayed datagram errors. These are only queued
4201 * when the application sets SO_DGRAM_ERRIND.
4202 * Return the error if we are sending to the address
4203 * that was returned in the last T_UDERROR_IND.
4204 * If sending to some other address discard the delayed
4205 * error indication.
4206 */
4207 if (sti->sti_delayed_error) {
4208 struct T_uderror_ind *tudi;
4209 void *addr;
4210 t_uscalar_t addrlen;
4211 boolean_t match = B_FALSE;
4212
4213 ASSERT(sti->sti_eaddr_mp);
4214 error = sti->sti_delayed_error;
4215 sti->sti_delayed_error = 0;
4216 tudi =
4217 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4218 addrlen = tudi->DEST_length;
4219 addr = sogetoff(sti->sti_eaddr_mp,
4220 tudi->DEST_offset, addrlen, 1);
4221 ASSERT(addr); /* Checked by strsock_proto */
4222 switch (so->so_family) {
4223 case AF_INET: {
4224 /* Compare just IP address and port */
4225 sin_t *sin1 = (sin_t *)name;
4226 sin_t *sin2 = (sin_t *)addr;
4227
4228 if (addrlen == sizeof (sin_t) &&
4229 namelen == addrlen &&
4230 sin1->sin_port == sin2->sin_port &&
4231 sin1->sin_addr.s_addr ==
4232 sin2->sin_addr.s_addr)
4233 match = B_TRUE;
4234 break;
4235 }
4236 case AF_INET6: {
4237 /* Compare just IP address and port. Not flow */
4238 sin6_t *sin1 = (sin6_t *)name;
4239 sin6_t *sin2 = (sin6_t *)addr;
4240
4241 if (addrlen == sizeof (sin6_t) &&
4242 namelen == addrlen &&
4243 sin1->sin6_port == sin2->sin6_port &&
4244 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4245 &sin2->sin6_addr))
4246 match = B_TRUE;
4247 break;
4248 }
4249 case AF_UNIX:
4250 default:
4251 if (namelen == addrlen &&
4252 bcmp(name, addr, namelen) == 0)
4253 match = B_TRUE;
4254 }
4255 if (match) {
4256 freemsg(sti->sti_eaddr_mp);
4257 sti->sti_eaddr_mp = NULL;
4258 mutex_exit(&so->so_lock);
4259 #ifdef DEBUG
4260 dprintso(so, 0,
4261 ("sockfs delayed error %d for %s\n",
4262 error,
4263 pr_addr(so->so_family, name, namelen)));
4264 #endif /* DEBUG */
4265 return (error);
4266 }
4267 freemsg(sti->sti_eaddr_mp);
4268 sti->sti_eaddr_mp = NULL;
4269 }
4270 }
4271 mutex_exit(&so->so_lock);
4272
4273 dontroute = 0;
4274 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4275 uint32_t val;
4276
4277 val = 1;
4278 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4279 &val, (t_uscalar_t)sizeof (val), cr);
4280 if (error)
4281 return (error);
4282 dontroute = 1;
4283 }
4284
4285 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4286 error = EOPNOTSUPP;
4287 goto done;
4288 }
4289 if (msg->msg_controllen != 0) {
4290 if (!(so_mode & SM_CONNREQUIRED)) {
4291 so_update_attrs(so, SOMOD);
4292 error = sosend_dgramcmsg(so, name, namelen, uiop,
4293 msg->msg_control, msg->msg_controllen, flags);
4294 } else {
4295 if (flags & MSG_OOB) {
4296 /* Can't generate T_EXDATA_REQ with options */
4297 error = EOPNOTSUPP;
4298 goto done;
4299 }
4300 so_update_attrs(so, SOMOD);
4301 error = sosend_svccmsg(so, uiop,
4302 !(flags & MSG_EOR),
4303 msg->msg_control, msg->msg_controllen,
4304 flags);
4305 }
4306 goto done;
4307 }
4308
4309 so_update_attrs(so, SOMOD);
4310 if (!(so_mode & SM_CONNREQUIRED)) {
4311 /*
4312 * If there is no SO_DONTROUTE to turn off return immediately
4313 * from send_dgram. This can allow tail-call optimizations.
4314 */
4315 if (!dontroute) {
4316 return (sosend_dgram(so, name, namelen, uiop, flags));
4317 }
4318 error = sosend_dgram(so, name, namelen, uiop, flags);
4319 } else {
4320 t_scalar_t prim;
4321 int sflag;
4322
4323 /* Ignore msg_name in the connected state */
4324 if (flags & MSG_OOB) {
4325 prim = T_EXDATA_REQ;
4326 /*
4327 * Send down T_EXDATA_REQ even if there is flow
4328 * control for data.
4329 */
4330 sflag = MSG_IGNFLOW;
4331 } else {
4332 if (so_mode & SM_BYTESTREAM) {
4333 /* Byte stream transport - use write */
4334 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4335
4336 /* Send M_DATA messages */
4337 /*
4338 * If there is no SO_DONTROUTE to turn off,
4339 * sti_direct is on, and there is no flow
4340 * control, we can take the fast path.
4341 */
4342 if (!dontroute && sti->sti_direct != 0 &&
4343 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4344 return (sostream_direct(so, uiop,
4345 NULL, cr));
4346 }
4347 error = strwrite(SOTOV(so), uiop, cr);
4348 goto done;
4349 }
4350 prim = T_DATA_REQ;
4351 sflag = 0;
4352 }
4353 /*
4354 * If there is no SO_DONTROUTE to turn off return immediately
4355 * from sosend_svc. This can allow tail-call optimizations.
4356 */
4357 if (!dontroute)
4358 return (sosend_svc(so, uiop, prim,
4359 !(flags & MSG_EOR), sflag));
4360 error = sosend_svc(so, uiop, prim,
4361 !(flags & MSG_EOR), sflag);
4362 }
4363 ASSERT(dontroute);
4364 done:
4365 if (dontroute) {
4366 uint32_t val;
4367
4368 val = 0;
4369 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4370 &val, (t_uscalar_t)sizeof (val), cr);
4371 }
4372 return (error);
4373 }
4374
4375 /*
4376 * kstrwritemp() has very similar semantics as that of strwrite().
4377 * The main difference is it obtains mblks from the caller and also
4378 * does not do any copy as done in strwrite() from user buffers to
4379 * kernel buffers.
4380 *
4381 * Currently, this routine is used by sendfile to send data allocated
4382 * within the kernel without any copying. This interface does not use the
4383 * synchronous stream interface as synch. stream interface implies
4384 * copying.
4385 */
4386 int
kstrwritemp(struct vnode * vp,mblk_t * mp,ushort_t fmode)4387 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4388 {
4389 struct stdata *stp;
4390 struct queue *wqp;
4391 mblk_t *newmp;
4392 char waitflag;
4393 int tempmode;
4394 int error = 0;
4395 int done = 0;
4396 struct sonode *so;
4397 boolean_t direct;
4398
4399 ASSERT(vp->v_stream);
4400 stp = vp->v_stream;
4401
4402 so = VTOSO(vp);
4403 direct = _SOTOTPI(so)->sti_direct;
4404
4405 /*
4406 * This is the sockfs direct fast path. canputnext() need
4407 * not be accurate so we don't grab the sd_lock here. If
4408 * we get flow-controlled, we grab sd_lock just before the
4409 * do..while loop below to emulate what strwrite() does.
4410 */
4411 wqp = stp->sd_wrq;
4412 if (canputnext(wqp) && direct &&
4413 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4414 return (sostream_direct(so, NULL, mp, CRED()));
4415 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4416 /* Fast check of flags before acquiring the lock */
4417 mutex_enter(&stp->sd_lock);
4418 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4419 mutex_exit(&stp->sd_lock);
4420 if (error != 0) {
4421 if (!(stp->sd_flag & STPLEX) &&
4422 (stp->sd_wput_opt & SW_SIGPIPE)) {
4423 error = EPIPE;
4424 }
4425 return (error);
4426 }
4427 }
4428
4429 waitflag = WRITEWAIT;
4430 if (stp->sd_flag & OLDNDELAY)
4431 tempmode = fmode & ~FNDELAY;
4432 else
4433 tempmode = fmode;
4434
4435 mutex_enter(&stp->sd_lock);
4436 do {
4437 if (canputnext(wqp)) {
4438 mutex_exit(&stp->sd_lock);
4439 if (stp->sd_wputdatafunc != NULL) {
4440 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4441 NULL, NULL, NULL);
4442 if (newmp == NULL) {
4443 /* The caller will free mp */
4444 return (ECOMM);
4445 }
4446 mp = newmp;
4447 }
4448 putnext(wqp, mp);
4449 return (0);
4450 }
4451 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4452 &done);
4453 } while (error == 0 && !done);
4454
4455 mutex_exit(&stp->sd_lock);
4456 /*
4457 * EAGAIN tells the application to try again. ENOMEM
4458 * is returned only if the memory allocation size
4459 * exceeds the physical limits of the system. ENOMEM
4460 * can't be true here.
4461 */
4462 if (error == ENOMEM)
4463 error = EAGAIN;
4464 return (error);
4465 }
4466
4467 /* ARGSUSED */
4468 static int
sotpi_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)4469 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4470 struct cred *cr, mblk_t **mpp)
4471 {
4472 int error;
4473
4474 switch (so->so_family) {
4475 case AF_INET:
4476 case AF_INET6:
4477 case AF_UNIX:
4478 break;
4479 default:
4480 return (EAFNOSUPPORT);
4481
4482 }
4483
4484 if (so->so_state & SS_CANTSENDMORE)
4485 return (EPIPE);
4486
4487 if (so->so_type != SOCK_STREAM)
4488 return (EOPNOTSUPP);
4489
4490 if ((so->so_state & SS_ISCONNECTED) == 0)
4491 return (ENOTCONN);
4492
4493 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4494 if (error == 0)
4495 *mpp = NULL;
4496 return (error);
4497 }
4498
4499 /*
4500 * Sending data on a datagram socket.
4501 * Assumes caller has verified that SS_ISBOUND etc. are set.
4502 */
4503 /* ARGSUSED */
4504 static int
sodgram_direct(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)4505 sodgram_direct(struct sonode *so, struct sockaddr *name,
4506 socklen_t namelen, struct uio *uiop, int flags)
4507 {
4508 struct T_unitdata_req tudr;
4509 mblk_t *mp = NULL;
4510 int error = 0;
4511 void *addr;
4512 socklen_t addrlen;
4513 ssize_t len;
4514 struct stdata *stp = SOTOV(so)->v_stream;
4515 int so_state;
4516 queue_t *udp_wq;
4517 boolean_t connected;
4518 mblk_t *mpdata = NULL;
4519 sotpi_info_t *sti = SOTOTPI(so);
4520 uint32_t auditing = AU_AUDITING();
4521
4522 ASSERT(name != NULL && namelen != 0);
4523 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4524 ASSERT(!(so->so_mode & SM_EXDATA));
4525 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4526 ASSERT(SOTOV(so)->v_type == VSOCK);
4527
4528 /* Caller checked for proper length */
4529 len = uiop->uio_resid;
4530 ASSERT(len <= sti->sti_tidu_size);
4531
4532 /* Length and family checks have been done by caller */
4533 ASSERT(name->sa_family == so->so_family);
4534 ASSERT(so->so_family == AF_INET ||
4535 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4536 ASSERT(so->so_family == AF_INET6 ||
4537 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4538
4539 addr = name;
4540 addrlen = namelen;
4541
4542 if (stp->sd_sidp != NULL &&
4543 (error = straccess(stp, JCWRITE)) != 0)
4544 goto done;
4545
4546 so_state = so->so_state;
4547
4548 connected = so_state & SS_ISCONNECTED;
4549 if (!connected) {
4550 tudr.PRIM_type = T_UNITDATA_REQ;
4551 tudr.DEST_length = addrlen;
4552 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4553 tudr.OPT_length = 0;
4554 tudr.OPT_offset = 0;
4555
4556 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4557 _ALLOC_INTR, CRED());
4558 if (mp == NULL) {
4559 /*
4560 * Caught a signal waiting for memory.
4561 * Let send* return EINTR.
4562 */
4563 error = EINTR;
4564 goto done;
4565 }
4566 }
4567
4568 /*
4569 * For UDP we don't break up the copyin into smaller pieces
4570 * as in the TCP case. That means if ENOMEM is returned by
4571 * mcopyinuio() then the uio vector has not been modified at
4572 * all and we fallback to either strwrite() or kstrputmsg()
4573 * below. Note also that we never generate priority messages
4574 * from here.
4575 */
4576 udp_wq = stp->sd_wrq->q_next;
4577 if (canput(udp_wq) &&
4578 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4579 ASSERT(DB_TYPE(mpdata) == M_DATA);
4580 ASSERT(uiop->uio_resid == 0);
4581 if (!connected)
4582 linkb(mp, mpdata);
4583 else
4584 mp = mpdata;
4585 if (auditing)
4586 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4587
4588 /* Always returns 0... */
4589 return (udp_wput(udp_wq, mp));
4590 }
4591
4592 ASSERT(mpdata == NULL);
4593 if (error != 0 && error != ENOMEM) {
4594 freemsg(mp);
4595 return (error);
4596 }
4597
4598 /*
4599 * For connected, let strwrite() handle the blocking case.
4600 * Otherwise we fall thru and use kstrputmsg().
4601 */
4602 if (connected)
4603 return (strwrite(SOTOV(so), uiop, CRED()));
4604
4605 if (auditing)
4606 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4607
4608 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4609 done:
4610 #ifdef SOCK_DEBUG
4611 if (error != 0) {
4612 eprintsoline(so, error);
4613 }
4614 #endif /* SOCK_DEBUG */
4615 return (error);
4616 }
4617
4618 int
sostream_direct(struct sonode * so,struct uio * uiop,mblk_t * mp,cred_t * cr)4619 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4620 {
4621 struct stdata *stp = SOTOV(so)->v_stream;
4622 ssize_t iosize, rmax, maxblk;
4623 queue_t *tcp_wq = stp->sd_wrq->q_next;
4624 mblk_t *newmp;
4625 int error = 0, wflag = 0;
4626
4627 ASSERT(so->so_mode & SM_BYTESTREAM);
4628 ASSERT(SOTOV(so)->v_type == VSOCK);
4629
4630 if (stp->sd_sidp != NULL &&
4631 (error = straccess(stp, JCWRITE)) != 0)
4632 return (error);
4633
4634 if (uiop == NULL) {
4635 /*
4636 * kstrwritemp() should have checked sd_flag and
4637 * flow-control before coming here. If we end up
4638 * here it means that we can simply pass down the
4639 * data to tcp.
4640 */
4641 ASSERT(mp != NULL);
4642 if (stp->sd_wputdatafunc != NULL) {
4643 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4644 NULL, NULL, NULL);
4645 if (newmp == NULL) {
4646 /* The caller will free mp */
4647 return (ECOMM);
4648 }
4649 mp = newmp;
4650 }
4651 /* Always returns 0... */
4652 return (tcp_wput(tcp_wq, mp));
4653 }
4654
4655 /* Fallback to strwrite() to do proper error handling */
4656 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4657 return (strwrite(SOTOV(so), uiop, cr));
4658
4659 rmax = stp->sd_qn_maxpsz;
4660 ASSERT(rmax >= 0 || rmax == INFPSZ);
4661 if (rmax == 0 || uiop->uio_resid <= 0)
4662 return (0);
4663
4664 if (rmax == INFPSZ)
4665 rmax = uiop->uio_resid;
4666
4667 maxblk = stp->sd_maxblk;
4668
4669 for (;;) {
4670 iosize = MIN(uiop->uio_resid, rmax);
4671
4672 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4673 if (mp == NULL) {
4674 /*
4675 * Fallback to strwrite() for ENOMEM; if this
4676 * is our first time in this routine and the uio
4677 * vector has not been modified, we will end up
4678 * calling strwrite() without any flag set.
4679 */
4680 if (error == ENOMEM)
4681 goto slow_send;
4682 else
4683 return (error);
4684 }
4685 ASSERT(uiop->uio_resid >= 0);
4686 /*
4687 * If mp is non-NULL and ENOMEM is set, it means that
4688 * mcopyinuio() was able to break down some of the user
4689 * data into one or more mblks. Send the partial data
4690 * to tcp and let the rest be handled in strwrite().
4691 */
4692 ASSERT(error == 0 || error == ENOMEM);
4693 if (stp->sd_wputdatafunc != NULL) {
4694 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4695 NULL, NULL, NULL);
4696 if (newmp == NULL) {
4697 /* The caller will free mp */
4698 return (ECOMM);
4699 }
4700 mp = newmp;
4701 }
4702 (void) tcp_wput(tcp_wq, mp); /* Always returns 0 anyway. */
4703
4704 wflag |= NOINTR;
4705
4706 if (uiop->uio_resid == 0) { /* No more data; we're done */
4707 ASSERT(error == 0);
4708 break;
4709 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4710 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4711 slow_send:
4712 /*
4713 * We were able to send down partial data using
4714 * the direct call interface, but are now relying
4715 * on strwrite() to handle the non-fastpath cases.
4716 * If the socket is blocking we will sleep in
4717 * strwaitq() until write is permitted, otherwise,
4718 * we will need to return the amount of bytes
4719 * written so far back to the app. This is the
4720 * reason why we pass NOINTR flag to strwrite()
4721 * for non-blocking socket, because we don't want
4722 * to return EAGAIN when portion of the user data
4723 * has actually been sent down.
4724 */
4725 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4726 }
4727 }
4728 return (0);
4729 }
4730
4731 /*
4732 * Update sti_faddr by asking the transport (unless AF_UNIX).
4733 */
4734 /* ARGSUSED */
4735 int
sotpi_getpeername(struct sonode * so,struct sockaddr * name,socklen_t * namelen,boolean_t accept,struct cred * cr)4736 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4737 boolean_t accept, struct cred *cr)
4738 {
4739 struct strbuf strbuf;
4740 int error = 0, res;
4741 void *addr;
4742 t_uscalar_t addrlen;
4743 k_sigset_t smask;
4744 sotpi_info_t *sti = SOTOTPI(so);
4745
4746 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4747 (void *)so, pr_state(so->so_state, so->so_mode)));
4748
4749 ASSERT(*namelen > 0);
4750 mutex_enter(&so->so_lock);
4751 so_lock_single(so); /* Set SOLOCKED */
4752
4753 if (accept) {
4754 bcopy(sti->sti_faddr_sa, name,
4755 MIN(*namelen, sti->sti_faddr_len));
4756 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4757 goto done;
4758 }
4759
4760 if (!(so->so_state & SS_ISCONNECTED)) {
4761 error = ENOTCONN;
4762 goto done;
4763 }
4764 /* Added this check for X/Open */
4765 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4766 error = EINVAL;
4767 if (xnet_check_print) {
4768 printf("sockfs: X/Open getpeername check => EINVAL\n");
4769 }
4770 goto done;
4771 }
4772
4773 if (sti->sti_faddr_valid) {
4774 bcopy(sti->sti_faddr_sa, name,
4775 MIN(*namelen, sti->sti_faddr_len));
4776 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4777 goto done;
4778 }
4779
4780 #ifdef DEBUG
4781 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4782 pr_addr(so->so_family, sti->sti_faddr_sa,
4783 (t_uscalar_t)sti->sti_faddr_len)));
4784 #endif /* DEBUG */
4785
4786 if (so->so_family == AF_UNIX) {
4787 /* Transport has different name space - return local info */
4788 if (sti->sti_faddr_noxlate)
4789 *namelen = 0;
4790 error = 0;
4791 goto done;
4792 }
4793
4794 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4795
4796 ASSERT(sti->sti_faddr_sa);
4797 /* Allocate local buffer to use with ioctl */
4798 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4799 mutex_exit(&so->so_lock);
4800 addr = kmem_alloc(addrlen, KM_SLEEP);
4801
4802 /*
4803 * Issue TI_GETPEERNAME with signals masked.
4804 * Put the result in sti_faddr_sa so that getpeername works after
4805 * a shutdown(output).
4806 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4807 * back to the socket.
4808 */
4809 strbuf.buf = addr;
4810 strbuf.maxlen = addrlen;
4811 strbuf.len = 0;
4812
4813 sigintr(&smask, 0);
4814 res = 0;
4815 ASSERT(cr);
4816 error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4817 0, K_TO_K, cr, &res);
4818 sigunintr(&smask);
4819
4820 mutex_enter(&so->so_lock);
4821 /*
4822 * If there is an error record the error in so_error put don't fail
4823 * the getpeername. Instead fallback on the recorded
4824 * sti->sti_faddr_sa.
4825 */
4826 if (error) {
4827 /*
4828 * Various stream head errors can be returned to the ioctl.
4829 * However, it is impossible to determine which ones of
4830 * these are really socket level errors that were incorrectly
4831 * consumed by the ioctl. Thus this code silently ignores the
4832 * error - to code explicitly does not reinstate the error
4833 * using soseterror().
4834 * Experiments have shows that at least this set of
4835 * errors are reported and should not be reinstated on the
4836 * socket:
4837 * EINVAL E.g. if an I_LINK was in effect when
4838 * getpeername was called.
4839 * EPIPE The ioctl error semantics prefer the write
4840 * side error over the read side error.
4841 * ENOTCONN The transport just got disconnected but
4842 * sockfs had not yet seen the T_DISCON_IND
4843 * when issuing the ioctl.
4844 */
4845 error = 0;
4846 } else if (res == 0 && strbuf.len > 0 &&
4847 (so->so_state & SS_ISCONNECTED)) {
4848 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4849 sti->sti_faddr_len = (socklen_t)strbuf.len;
4850 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4851 sti->sti_faddr_valid = 1;
4852
4853 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4854 *namelen = sti->sti_faddr_len;
4855 }
4856 kmem_free(addr, addrlen);
4857 #ifdef DEBUG
4858 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4859 pr_addr(so->so_family, sti->sti_faddr_sa,
4860 (t_uscalar_t)sti->sti_faddr_len)));
4861 #endif /* DEBUG */
4862 done:
4863 so_unlock_single(so, SOLOCKED);
4864 mutex_exit(&so->so_lock);
4865 return (error);
4866 }
4867
4868 /*
4869 * Update sti_laddr by asking the transport (unless AF_UNIX).
4870 */
4871 int
sotpi_getsockname(struct sonode * so,struct sockaddr * name,socklen_t * namelen,struct cred * cr)4872 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4873 struct cred *cr)
4874 {
4875 struct strbuf strbuf;
4876 int error = 0, res;
4877 void *addr;
4878 t_uscalar_t addrlen;
4879 k_sigset_t smask;
4880 sotpi_info_t *sti = SOTOTPI(so);
4881
4882 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4883 (void *)so, pr_state(so->so_state, so->so_mode)));
4884
4885 ASSERT(*namelen > 0);
4886 mutex_enter(&so->so_lock);
4887 so_lock_single(so); /* Set SOLOCKED */
4888
4889 #ifdef DEBUG
4890
4891 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4892 pr_addr(so->so_family, sti->sti_laddr_sa,
4893 (t_uscalar_t)sti->sti_laddr_len)));
4894 #endif /* DEBUG */
4895 if (sti->sti_laddr_valid) {
4896 bcopy(sti->sti_laddr_sa, name,
4897 MIN(*namelen, sti->sti_laddr_len));
4898 *namelen = sti->sti_laddr_len;
4899 goto done;
4900 }
4901
4902 if (so->so_family == AF_UNIX) {
4903 /*
4904 * Transport has different name space - return local info. If we
4905 * have enough space, let consumers know the family.
4906 */
4907 if (*namelen >= sizeof (sa_family_t)) {
4908 name->sa_family = AF_UNIX;
4909 *namelen = sizeof (sa_family_t);
4910 } else {
4911 *namelen = 0;
4912 }
4913 error = 0;
4914 goto done;
4915 }
4916 if (!(so->so_state & SS_ISBOUND)) {
4917 /* If not bound, then nothing to return. */
4918 error = 0;
4919 goto done;
4920 }
4921
4922 /* Allocate local buffer to use with ioctl */
4923 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4924 mutex_exit(&so->so_lock);
4925 addr = kmem_alloc(addrlen, KM_SLEEP);
4926
4927 /*
4928 * Issue TI_GETMYNAME with signals masked.
4929 * Put the result in sti_laddr_sa so that getsockname works after
4930 * a shutdown(output).
4931 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4932 * back to the socket.
4933 */
4934 strbuf.buf = addr;
4935 strbuf.maxlen = addrlen;
4936 strbuf.len = 0;
4937
4938 sigintr(&smask, 0);
4939 res = 0;
4940 ASSERT(cr);
4941 error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4942 0, K_TO_K, cr, &res);
4943 sigunintr(&smask);
4944
4945 mutex_enter(&so->so_lock);
4946 /*
4947 * If there is an error record the error in so_error put don't fail
4948 * the getsockname. Instead fallback on the recorded
4949 * sti->sti_laddr_sa.
4950 */
4951 if (error) {
4952 /*
4953 * Various stream head errors can be returned to the ioctl.
4954 * However, it is impossible to determine which ones of
4955 * these are really socket level errors that were incorrectly
4956 * consumed by the ioctl. Thus this code silently ignores the
4957 * error - to code explicitly does not reinstate the error
4958 * using soseterror().
4959 * Experiments have shows that at least this set of
4960 * errors are reported and should not be reinstated on the
4961 * socket:
4962 * EINVAL E.g. if an I_LINK was in effect when
4963 * getsockname was called.
4964 * EPIPE The ioctl error semantics prefer the write
4965 * side error over the read side error.
4966 */
4967 error = 0;
4968 } else if (res == 0 && strbuf.len > 0 &&
4969 (so->so_state & SS_ISBOUND)) {
4970 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
4971 sti->sti_laddr_len = (socklen_t)strbuf.len;
4972 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
4973 sti->sti_laddr_valid = 1;
4974
4975 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
4976 *namelen = sti->sti_laddr_len;
4977 }
4978 kmem_free(addr, addrlen);
4979 #ifdef DEBUG
4980 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4981 pr_addr(so->so_family, sti->sti_laddr_sa,
4982 (t_uscalar_t)sti->sti_laddr_len)));
4983 #endif /* DEBUG */
4984 done:
4985 so_unlock_single(so, SOLOCKED);
4986 mutex_exit(&so->so_lock);
4987 return (error);
4988 }
4989
4990 /*
4991 * Get socket options. For SOL_SOCKET options some options are handled
4992 * by the sockfs while others use the value recorded in the sonode as a
4993 * fallback should the T_SVR4_OPTMGMT_REQ fail.
4994 *
4995 * On return, at most *optlenp bytes are copied to optval.
4996 */
4997 /* ARGSUSED */
4998 int
sotpi_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,struct cred * cr)4999 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5000 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5001 {
5002 struct T_optmgmt_req optmgmt_req;
5003 struct T_optmgmt_ack *optmgmt_ack;
5004 struct opthdr oh;
5005 struct opthdr *opt_res;
5006 mblk_t *mp = NULL;
5007 int error = 0;
5008 void *option = NULL; /* Set if fallback value */
5009 t_uscalar_t maxlen = *optlenp;
5010 t_uscalar_t len;
5011 uint32_t value;
5012 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5013 struct timeval32 tmo_val32;
5014 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
5015
5016 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5017 (void *)so, level, option_name, optval, (void *)optlenp,
5018 pr_state(so->so_state, so->so_mode)));
5019
5020 mutex_enter(&so->so_lock);
5021 so_lock_single(so); /* Set SOLOCKED */
5022
5023 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5024
5025 /*
5026 * Check for SOL_SOCKET options.
5027 * Certain SOL_SOCKET options are returned directly whereas
5028 * others only provide a default (fallback) value should
5029 * the T_SVR4_OPTMGMT_REQ fail.
5030 */
5031 if (level == SOL_SOCKET) {
5032 /* Check parameters */
5033 switch (option_name) {
5034 case SO_TYPE:
5035 case SO_ERROR:
5036 case SO_DEBUG:
5037 case SO_ACCEPTCONN:
5038 case SO_REUSEADDR:
5039 case SO_KEEPALIVE:
5040 case SO_DONTROUTE:
5041 case SO_BROADCAST:
5042 case SO_USELOOPBACK:
5043 case SO_OOBINLINE:
5044 case SO_SNDBUF:
5045 case SO_RCVBUF:
5046 #ifdef notyet
5047 case SO_SNDLOWAT:
5048 case SO_RCVLOWAT:
5049 #endif /* notyet */
5050 case SO_DOMAIN:
5051 case SO_DGRAM_ERRIND:
5052 case SO_PROTOCOL:
5053 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5054 error = EINVAL;
5055 eprintsoline(so, error);
5056 goto done2;
5057 }
5058 break;
5059 case SO_RCVTIMEO:
5060 case SO_SNDTIMEO:
5061 if (get_udatamodel() == DATAMODEL_NONE ||
5062 get_udatamodel() == DATAMODEL_NATIVE) {
5063 if (maxlen < sizeof (struct timeval)) {
5064 error = EINVAL;
5065 eprintsoline(so, error);
5066 goto done2;
5067 }
5068 } else {
5069 if (maxlen < sizeof (struct timeval32)) {
5070 error = EINVAL;
5071 eprintsoline(so, error);
5072 goto done2;
5073 }
5074
5075 }
5076 break;
5077 case SO_LINGER:
5078 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5079 error = EINVAL;
5080 eprintsoline(so, error);
5081 goto done2;
5082 }
5083 break;
5084 case SO_SND_BUFINFO:
5085 if (maxlen < (t_uscalar_t)
5086 sizeof (struct so_snd_bufinfo)) {
5087 error = EINVAL;
5088 eprintsoline(so, error);
5089 goto done2;
5090 }
5091 break;
5092 }
5093
5094 switch (option_name) {
5095 case SO_TYPE:
5096 value = so->so_type;
5097 option = &value;
5098 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5099
5100 case SO_ERROR:
5101 value = sogeterr(so, B_TRUE);
5102 option = &value;
5103 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5104
5105 case SO_ACCEPTCONN:
5106 if (so->so_state & SS_ACCEPTCONN)
5107 value = SO_ACCEPTCONN;
5108 else
5109 value = 0;
5110 #ifdef DEBUG
5111 if (value) {
5112 dprintso(so, 1,
5113 ("sotpi_getsockopt: 0x%x is set\n",
5114 option_name));
5115 } else {
5116 dprintso(so, 1,
5117 ("sotpi_getsockopt: 0x%x not set\n",
5118 option_name));
5119 }
5120 #endif /* DEBUG */
5121 option = &value;
5122 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5123
5124 case SO_DEBUG:
5125 case SO_REUSEADDR:
5126 case SO_KEEPALIVE:
5127 case SO_DONTROUTE:
5128 case SO_BROADCAST:
5129 case SO_USELOOPBACK:
5130 case SO_OOBINLINE:
5131 case SO_DGRAM_ERRIND:
5132 value = (so->so_options & option_name);
5133 #ifdef DEBUG
5134 if (value) {
5135 dprintso(so, 1,
5136 ("sotpi_getsockopt: 0x%x is set\n",
5137 option_name));
5138 } else {
5139 dprintso(so, 1,
5140 ("sotpi_getsockopt: 0x%x not set\n",
5141 option_name));
5142 }
5143 #endif /* DEBUG */
5144 option = &value;
5145 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5146
5147 /*
5148 * The following options are only returned by sockfs when the
5149 * T_SVR4_OPTMGMT_REQ fails.
5150 */
5151 case SO_LINGER:
5152 option = &so->so_linger;
5153 len = (t_uscalar_t)sizeof (struct linger);
5154 break;
5155 case SO_SNDBUF: {
5156 ssize_t lvalue;
5157
5158 /*
5159 * If the option has not been set then get a default
5160 * value from the read queue. This value is
5161 * returned if the transport fails
5162 * the T_SVR4_OPTMGMT_REQ.
5163 */
5164 lvalue = so->so_sndbuf;
5165 if (lvalue == 0) {
5166 mutex_exit(&so->so_lock);
5167 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5168 QHIWAT, 0, &lvalue);
5169 mutex_enter(&so->so_lock);
5170 dprintso(so, 1,
5171 ("got SO_SNDBUF %ld from q\n", lvalue));
5172 }
5173 value = (int)lvalue;
5174 option = &value;
5175 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5176 break;
5177 }
5178 case SO_RCVBUF: {
5179 ssize_t lvalue;
5180
5181 /*
5182 * If the option has not been set then get a default
5183 * value from the read queue. This value is
5184 * returned if the transport fails
5185 * the T_SVR4_OPTMGMT_REQ.
5186 *
5187 * XXX If SO_RCVBUF has been set and this is an
5188 * XPG 4.2 application then do not ask the transport
5189 * since the transport might adjust the value and not
5190 * return exactly what was set by the application.
5191 * For non-XPG 4.2 application we return the value
5192 * that the transport is actually using.
5193 */
5194 lvalue = so->so_rcvbuf;
5195 if (lvalue == 0) {
5196 mutex_exit(&so->so_lock);
5197 (void) strqget(RD(strvp2wq(SOTOV(so))),
5198 QHIWAT, 0, &lvalue);
5199 mutex_enter(&so->so_lock);
5200 dprintso(so, 1,
5201 ("got SO_RCVBUF %ld from q\n", lvalue));
5202 } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5203 value = (int)lvalue;
5204 option = &value;
5205 goto copyout; /* skip asking transport */
5206 }
5207 value = (int)lvalue;
5208 option = &value;
5209 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5210 break;
5211 }
5212 case SO_DOMAIN:
5213 value = so->so_family;
5214 option = &value;
5215 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5216
5217 case SO_PROTOCOL:
5218 value = so->so_protocol;
5219 option = &value;
5220 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5221
5222 #ifdef notyet
5223 /*
5224 * We do not implement the semantics of these options
5225 * thus we shouldn't implement the options either.
5226 */
5227 case SO_SNDLOWAT:
5228 value = so->so_sndlowat;
5229 option = &value;
5230 break;
5231 case SO_RCVLOWAT:
5232 value = so->so_rcvlowat;
5233 option = &value;
5234 break;
5235 #endif /* notyet */
5236 case SO_SNDTIMEO:
5237 case SO_RCVTIMEO: {
5238 clock_t val;
5239
5240 if (option_name == SO_RCVTIMEO)
5241 val = drv_hztousec(so->so_rcvtimeo);
5242 else
5243 val = drv_hztousec(so->so_sndtimeo);
5244 tmo_val.tv_sec = val / (1000 * 1000);
5245 tmo_val.tv_usec = val % (1000 * 1000);
5246 if (get_udatamodel() == DATAMODEL_NONE ||
5247 get_udatamodel() == DATAMODEL_NATIVE) {
5248 option = &tmo_val;
5249 len = sizeof (struct timeval);
5250 } else {
5251 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5252 option = &tmo_val32;
5253 len = sizeof (struct timeval32);
5254 }
5255 break;
5256 }
5257 case SO_SND_BUFINFO: {
5258 snd_bufinfo.sbi_wroff =
5259 (so->so_proto_props).sopp_wroff;
5260 snd_bufinfo.sbi_maxblk =
5261 (so->so_proto_props).sopp_maxblk;
5262 snd_bufinfo.sbi_maxpsz =
5263 (so->so_proto_props).sopp_maxpsz;
5264 snd_bufinfo.sbi_tail =
5265 (so->so_proto_props).sopp_tail;
5266 option = &snd_bufinfo;
5267 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5268 break;
5269 }
5270 }
5271 }
5272
5273 mutex_exit(&so->so_lock);
5274
5275 /* Send request */
5276 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5277 optmgmt_req.MGMT_flags = T_CHECK;
5278 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5279 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5280
5281 oh.level = level;
5282 oh.name = option_name;
5283 oh.len = maxlen;
5284
5285 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5286 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5287 /* Let option management work in the presence of data flow control */
5288 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5289 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5290 mp = NULL;
5291 mutex_enter(&so->so_lock);
5292 if (error) {
5293 eprintsoline(so, error);
5294 goto done2;
5295 }
5296 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5297 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5298 if (error) {
5299 if (option != NULL) {
5300 /* We have a fallback value */
5301 error = 0;
5302 goto copyout;
5303 }
5304 eprintsoline(so, error);
5305 goto done2;
5306 }
5307 ASSERT(mp);
5308 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5309 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5310 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5311 if (opt_res == NULL) {
5312 if (option != NULL) {
5313 /* We have a fallback value */
5314 error = 0;
5315 goto copyout;
5316 }
5317 error = EPROTO;
5318 eprintsoline(so, error);
5319 goto done;
5320 }
5321 option = &opt_res[1];
5322
5323 /* check to ensure that the option is within bounds */
5324 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5325 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5326 if (option != NULL) {
5327 /* We have a fallback value */
5328 error = 0;
5329 goto copyout;
5330 }
5331 error = EPROTO;
5332 eprintsoline(so, error);
5333 goto done;
5334 }
5335
5336 len = opt_res->len;
5337
5338 copyout: {
5339 t_uscalar_t size = MIN(len, maxlen);
5340 bcopy(option, optval, size);
5341 bcopy(&size, optlenp, sizeof (size));
5342 }
5343 done:
5344 freemsg(mp);
5345 done2:
5346 so_unlock_single(so, SOLOCKED);
5347 mutex_exit(&so->so_lock);
5348
5349 return (error);
5350 }
5351
5352 /*
5353 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5354 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5355 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5356 * setsockopt has to work even if the transport does not support the option.
5357 */
5358 /* ARGSUSED */
5359 int
sotpi_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,struct cred * cr)5360 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5361 const void *optval, t_uscalar_t optlen, struct cred *cr)
5362 {
5363 struct T_optmgmt_req optmgmt_req;
5364 struct opthdr oh;
5365 mblk_t *mp;
5366 int error = 0;
5367 boolean_t handled = B_FALSE;
5368
5369 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5370 (void *)so, level, option_name, optval, optlen,
5371 pr_state(so->so_state, so->so_mode)));
5372
5373 /* X/Open requires this check */
5374 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5375 if (xnet_check_print)
5376 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5377 return (EINVAL);
5378 }
5379
5380 mutex_enter(&so->so_lock);
5381 so_lock_single(so); /* Set SOLOCKED */
5382 mutex_exit(&so->so_lock);
5383
5384 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5385 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5386 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5387 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5388
5389 oh.level = level;
5390 oh.name = option_name;
5391 oh.len = optlen;
5392
5393 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5394 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5395 /* Let option management work in the presence of data flow control */
5396 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5397 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5398 mp = NULL;
5399 mutex_enter(&so->so_lock);
5400 if (error) {
5401 eprintsoline(so, error);
5402 goto done2;
5403 }
5404 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5405 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5406 if (error) {
5407 eprintsoline(so, error);
5408 goto done;
5409 }
5410 ASSERT(mp);
5411 /* No need to verify T_optmgmt_ack */
5412 freemsg(mp);
5413 done:
5414 /*
5415 * Check for SOL_SOCKET options and record their values.
5416 * If we know about a SOL_SOCKET parameter and the transport
5417 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5418 * EPROTO) we let the setsockopt succeed.
5419 */
5420 if (level == SOL_SOCKET) {
5421 /* Check parameters */
5422 switch (option_name) {
5423 case SO_DEBUG:
5424 case SO_REUSEADDR:
5425 case SO_KEEPALIVE:
5426 case SO_DONTROUTE:
5427 case SO_BROADCAST:
5428 case SO_USELOOPBACK:
5429 case SO_OOBINLINE:
5430 case SO_SNDBUF:
5431 case SO_RCVBUF:
5432 #ifdef notyet
5433 case SO_SNDLOWAT:
5434 case SO_RCVLOWAT:
5435 #endif /* notyet */
5436 case SO_DGRAM_ERRIND:
5437 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5438 error = EINVAL;
5439 eprintsoline(so, error);
5440 goto done2;
5441 }
5442 ASSERT(optval);
5443 handled = B_TRUE;
5444 break;
5445 case SO_SNDTIMEO:
5446 case SO_RCVTIMEO:
5447 if (get_udatamodel() == DATAMODEL_NONE ||
5448 get_udatamodel() == DATAMODEL_NATIVE) {
5449 if (optlen != sizeof (struct timeval)) {
5450 error = EINVAL;
5451 eprintsoline(so, error);
5452 goto done2;
5453 }
5454 } else {
5455 if (optlen != sizeof (struct timeval32)) {
5456 error = EINVAL;
5457 eprintsoline(so, error);
5458 goto done2;
5459 }
5460 }
5461 ASSERT(optval);
5462 handled = B_TRUE;
5463 break;
5464 case SO_LINGER:
5465 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5466 error = EINVAL;
5467 eprintsoline(so, error);
5468 goto done2;
5469 }
5470 ASSERT(optval);
5471 handled = B_TRUE;
5472 break;
5473 }
5474
5475 #define intvalue (*(int32_t *)optval)
5476
5477 switch (option_name) {
5478 case SO_TYPE:
5479 case SO_ERROR:
5480 case SO_ACCEPTCONN:
5481 /* Can't be set */
5482 error = ENOPROTOOPT;
5483 goto done2;
5484 case SO_LINGER: {
5485 struct linger *l = (struct linger *)optval;
5486
5487 so->so_linger.l_linger = l->l_linger;
5488 if (l->l_onoff) {
5489 so->so_linger.l_onoff = SO_LINGER;
5490 so->so_options |= SO_LINGER;
5491 } else {
5492 so->so_linger.l_onoff = 0;
5493 so->so_options &= ~SO_LINGER;
5494 }
5495 break;
5496 }
5497
5498 case SO_DEBUG:
5499 #ifdef SOCK_TEST
5500 if (intvalue & 2)
5501 sock_test_timelimit = 10 * hz;
5502 else
5503 sock_test_timelimit = 0;
5504
5505 if (intvalue & 4)
5506 do_useracc = 0;
5507 else
5508 do_useracc = 1;
5509 #endif /* SOCK_TEST */
5510 /* FALLTHRU */
5511 case SO_REUSEADDR:
5512 case SO_KEEPALIVE:
5513 case SO_DONTROUTE:
5514 case SO_BROADCAST:
5515 case SO_USELOOPBACK:
5516 case SO_OOBINLINE:
5517 case SO_DGRAM_ERRIND:
5518 if (intvalue != 0) {
5519 dprintso(so, 1,
5520 ("socket_setsockopt: setting 0x%x\n",
5521 option_name));
5522 so->so_options |= option_name;
5523 } else {
5524 dprintso(so, 1,
5525 ("socket_setsockopt: clearing 0x%x\n",
5526 option_name));
5527 so->so_options &= ~option_name;
5528 }
5529 break;
5530 /*
5531 * The following options are only returned by us when the
5532 * transport layer fails.
5533 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5534 * since the transport might adjust the value and not
5535 * return exactly what was set by the application.
5536 */
5537 case SO_SNDBUF:
5538 so->so_sndbuf = intvalue;
5539 break;
5540 case SO_RCVBUF:
5541 so->so_rcvbuf = intvalue;
5542 break;
5543 case SO_RCVPSH:
5544 so->so_rcv_timer_interval = intvalue;
5545 break;
5546 #ifdef notyet
5547 /*
5548 * We do not implement the semantics of these options
5549 * thus we shouldn't implement the options either.
5550 */
5551 case SO_SNDLOWAT:
5552 so->so_sndlowat = intvalue;
5553 break;
5554 case SO_RCVLOWAT:
5555 so->so_rcvlowat = intvalue;
5556 break;
5557 #endif /* notyet */
5558 case SO_SNDTIMEO:
5559 case SO_RCVTIMEO: {
5560 struct timeval tl;
5561 clock_t val;
5562
5563 if (get_udatamodel() == DATAMODEL_NONE ||
5564 get_udatamodel() == DATAMODEL_NATIVE) {
5565 bcopy((struct timeval *)optval, &tl,
5566 sizeof (struct timeval));
5567 } else {
5568 TIMEVAL32_TO_TIMEVAL(&tl,
5569 (struct timeval32 *)optval);
5570 }
5571 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5572 if (option_name == SO_RCVTIMEO)
5573 so->so_rcvtimeo = drv_usectohz(val);
5574 else
5575 so->so_sndtimeo = drv_usectohz(val);
5576 break;
5577 }
5578 }
5579 #undef intvalue
5580
5581 if (error) {
5582 if ((error == ENOPROTOOPT || error == EPROTO ||
5583 error == EINVAL) && handled) {
5584 dprintso(so, 1,
5585 ("setsockopt: ignoring error %d for 0x%x\n",
5586 error, option_name));
5587 error = 0;
5588 }
5589 }
5590 }
5591 done2:
5592 so_unlock_single(so, SOLOCKED);
5593 mutex_exit(&so->so_lock);
5594 return (error);
5595 }
5596
5597 /*
5598 * sotpi_close() is called when the last open reference goes away.
5599 */
5600 /* ARGSUSED */
5601 int
sotpi_close(struct sonode * so,int flag,struct cred * cr)5602 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5603 {
5604 struct vnode *vp = SOTOV(so);
5605 dev_t dev;
5606 int error = 0;
5607 sotpi_info_t *sti = SOTOTPI(so);
5608
5609 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5610 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5611
5612 dev = sti->sti_dev;
5613
5614 ASSERT(STREAMSTAB(getmajor(dev)));
5615
5616 mutex_enter(&so->so_lock);
5617 so_lock_single(so); /* Set SOLOCKED */
5618
5619 ASSERT(so_verify_oobstate(so));
5620
5621 if (vp->v_stream != NULL) {
5622 vnode_t *ux_vp;
5623
5624 if (so->so_family == AF_UNIX) {
5625 /* Could avoid this when CANTSENDMORE for !dgram */
5626 so_unix_close(so);
5627 }
5628
5629 mutex_exit(&so->so_lock);
5630 /*
5631 * Disassemble the linkage from the AF_UNIX underlying file
5632 * system vnode to this socket (by atomically clearing
5633 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5634 * and frees the stream head.
5635 */
5636 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5637 ASSERT(ux_vp->v_stream);
5638 sti->sti_ux_bound_vp = NULL;
5639 vn_rele_stream(ux_vp);
5640 }
5641 error = strclose(vp, flag, cr);
5642 vp->v_stream = NULL;
5643 mutex_enter(&so->so_lock);
5644 }
5645
5646 /*
5647 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5648 */
5649 so_flush_discon_ind(so);
5650
5651 so_unlock_single(so, SOLOCKED);
5652 mutex_exit(&so->so_lock);
5653
5654 /*
5655 * Needed for STREAMs.
5656 * Decrement the device driver's reference count for streams
5657 * opened via the clone dip. The driver was held in clone_open().
5658 * The absence of clone_close() forces this asymmetry.
5659 */
5660 if (so->so_flag & SOCLONE)
5661 ddi_rele_driver(getmajor(dev));
5662
5663 return (error);
5664 }
5665
5666 static int
sotpi_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5667 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5668 struct cred *cr, int32_t *rvalp)
5669 {
5670 struct vnode *vp = SOTOV(so);
5671 sotpi_info_t *sti = SOTOTPI(so);
5672 int error = 0;
5673
5674 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5675 cmd, arg, pr_state(so->so_state, so->so_mode)));
5676
5677 switch (cmd) {
5678 case SIOCSQPTR:
5679 /*
5680 * SIOCSQPTR is valid only when helper stream is created
5681 * by the protocol.
5682 */
5683 case _I_INSERT:
5684 case _I_REMOVE:
5685 /*
5686 * Since there's no compelling reason to support these ioctls
5687 * on sockets, and doing so would increase the complexity
5688 * markedly, prevent it.
5689 */
5690 return (EOPNOTSUPP);
5691
5692 case I_FIND:
5693 case I_LIST:
5694 case I_LOOK:
5695 case I_POP:
5696 case I_PUSH:
5697 /*
5698 * To prevent races and inconsistencies between the actual
5699 * state of the stream and the state according to the sonode,
5700 * we serialize all operations which modify or operate on the
5701 * list of modules on the socket's stream.
5702 */
5703 mutex_enter(&sti->sti_plumb_lock);
5704 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5705 mutex_exit(&sti->sti_plumb_lock);
5706 return (error);
5707
5708 default:
5709 if (so->so_version != SOV_STREAM)
5710 break;
5711
5712 /*
5713 * The imaginary "sockmod" has been popped; act as a stream.
5714 */
5715 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5716 }
5717
5718 ASSERT(so->so_version != SOV_STREAM);
5719
5720 /*
5721 * Process socket-specific ioctls.
5722 */
5723 switch (cmd) {
5724 case FIONBIO: {
5725 int32_t value;
5726
5727 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5728 (mode & (int)FKIOCTL)))
5729 return (EFAULT);
5730
5731 mutex_enter(&so->so_lock);
5732 if (value) {
5733 so->so_state |= SS_NDELAY;
5734 } else {
5735 so->so_state &= ~SS_NDELAY;
5736 }
5737 mutex_exit(&so->so_lock);
5738 return (0);
5739 }
5740
5741 case FIOASYNC: {
5742 int32_t value;
5743
5744 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5745 (mode & (int)FKIOCTL)))
5746 return (EFAULT);
5747
5748 mutex_enter(&so->so_lock);
5749 /*
5750 * SS_ASYNC flag not already set correctly?
5751 * (!value != !(so->so_state & SS_ASYNC))
5752 * but some engineers find that too hard to read.
5753 */
5754 if ((value == 0 && (so->so_state & SS_ASYNC) != 0) ||
5755 (value != 0 && (so->so_state & SS_ASYNC) == 0))
5756 error = so_flip_async(so, vp, mode, cr);
5757 mutex_exit(&so->so_lock);
5758 return (error);
5759 }
5760
5761 case SIOCSPGRP:
5762 case FIOSETOWN: {
5763 pid_t pgrp;
5764
5765 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5766 (mode & (int)FKIOCTL)))
5767 return (EFAULT);
5768
5769 mutex_enter(&so->so_lock);
5770 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5771 /* Any change? */
5772 if (pgrp != so->so_pgrp)
5773 error = so_set_siggrp(so, vp, pgrp, mode, cr);
5774 mutex_exit(&so->so_lock);
5775 return (error);
5776 }
5777 case SIOCGPGRP:
5778 case FIOGETOWN:
5779 if (so_copyout(&so->so_pgrp, (void *)arg,
5780 sizeof (pid_t), (mode & (int)FKIOCTL)))
5781 return (EFAULT);
5782 return (0);
5783
5784 case SIOCATMARK: {
5785 int retval;
5786 uint_t so_state;
5787
5788 /*
5789 * strwaitmark has a finite timeout after which it
5790 * returns -1 if the mark state is undetermined.
5791 * In order to avoid any race between the mark state
5792 * in sockfs and the mark state in the stream head this
5793 * routine loops until the mark state can be determined
5794 * (or the urgent data indication has been removed by some
5795 * other thread).
5796 */
5797 do {
5798 mutex_enter(&so->so_lock);
5799 so_state = so->so_state;
5800 mutex_exit(&so->so_lock);
5801 if (so_state & SS_RCVATMARK) {
5802 retval = 1;
5803 } else if (!(so_state & SS_OOBPEND)) {
5804 /*
5805 * No SIGURG has been generated -- there is no
5806 * pending or present urgent data. Thus can't
5807 * possibly be at the mark.
5808 */
5809 retval = 0;
5810 } else {
5811 /*
5812 * Have the stream head wait until there is
5813 * either some messages on the read queue, or
5814 * STRATMARK or STRNOTATMARK gets set. The
5815 * STRNOTATMARK flag is used so that the
5816 * transport can send up a MSGNOTMARKNEXT
5817 * M_DATA to indicate that it is not
5818 * at the mark and additional data is not about
5819 * to be send upstream.
5820 *
5821 * If the mark state is undetermined this will
5822 * return -1 and we will loop rechecking the
5823 * socket state.
5824 */
5825 retval = strwaitmark(vp);
5826 }
5827 } while (retval == -1);
5828
5829 if (so_copyout(&retval, (void *)arg, sizeof (int),
5830 (mode & (int)FKIOCTL)))
5831 return (EFAULT);
5832 return (0);
5833 }
5834
5835 case I_FDINSERT:
5836 case I_SENDFD:
5837 case I_RECVFD:
5838 case I_ATMARK:
5839 case _SIOCSOCKFALLBACK:
5840 /*
5841 * These ioctls do not apply to sockets. I_FDINSERT can be
5842 * used to send M_PROTO messages without modifying the socket
5843 * state. I_SENDFD/RECVFD should not be used for socket file
5844 * descriptor passing since they assume a twisted stream.
5845 * SIOCATMARK must be used instead of I_ATMARK.
5846 *
5847 * _SIOCSOCKFALLBACK from an application should never be
5848 * processed. It is only generated by socktpi_open() or
5849 * in response to I_POP or I_PUSH.
5850 */
5851 #ifdef DEBUG
5852 zcmn_err(getzoneid(), CE_WARN,
5853 "Unsupported STREAMS ioctl 0x%x on socket. "
5854 "Pid = %d\n", cmd, curproc->p_pid);
5855 #endif /* DEBUG */
5856 return (EOPNOTSUPP);
5857
5858 case _I_GETPEERCRED:
5859 if ((mode & FKIOCTL) == 0)
5860 return (EINVAL);
5861
5862 mutex_enter(&so->so_lock);
5863 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5864 error = ENOTSUP;
5865 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
5866 error = ENOTCONN;
5867 } else if (so->so_peercred != NULL) {
5868 k_peercred_t *kp = (k_peercred_t *)arg;
5869 kp->pc_cr = so->so_peercred;
5870 kp->pc_cpid = so->so_cpid;
5871 crhold(so->so_peercred);
5872 } else {
5873 error = EINVAL;
5874 }
5875 mutex_exit(&so->so_lock);
5876 return (error);
5877
5878 default:
5879 /*
5880 * Do the higher-order bits of the ioctl cmd indicate
5881 * that it is an I_* streams ioctl?
5882 */
5883 if ((cmd & 0xffffff00U) == STR &&
5884 so->so_version == SOV_SOCKBSD) {
5885 #ifdef DEBUG
5886 zcmn_err(getzoneid(), CE_WARN,
5887 "Unsupported STREAMS ioctl 0x%x on socket. "
5888 "Pid = %d\n", cmd, curproc->p_pid);
5889 #endif /* DEBUG */
5890 return (EOPNOTSUPP);
5891 }
5892 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5893 }
5894 }
5895
5896 /*
5897 * Handle plumbing-related ioctls.
5898 */
5899 static int
socktpi_plumbioctl(struct vnode * vp,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5900 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5901 struct cred *cr, int32_t *rvalp)
5902 {
5903 static const char sockmod_name[] = "sockmod";
5904 struct sonode *so = VTOSO(vp);
5905 char mname[FMNAMESZ + 1];
5906 int error;
5907 sotpi_info_t *sti = SOTOTPI(so);
5908
5909 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5910
5911 if (so->so_version == SOV_SOCKBSD)
5912 return (EOPNOTSUPP);
5913
5914 if (so->so_version == SOV_STREAM) {
5915 /*
5916 * The imaginary "sockmod" has been popped - act as a stream.
5917 * If this is a push of sockmod then change back to a socket.
5918 */
5919 if (cmd == I_PUSH) {
5920 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5921 (void *)arg, mname, sizeof (mname), NULL);
5922
5923 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5924 dprintso(so, 0, ("socktpi_ioctl: going to "
5925 "socket version\n"));
5926 so_stream2sock(so);
5927 return (0);
5928 }
5929 }
5930 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5931 }
5932
5933 switch (cmd) {
5934 case I_PUSH:
5935 if (sti->sti_direct) {
5936 mutex_enter(&so->so_lock);
5937 so_lock_single(so);
5938 mutex_exit(&so->so_lock);
5939
5940 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
5941 cr, rvalp);
5942
5943 mutex_enter(&so->so_lock);
5944 if (error == 0)
5945 sti->sti_direct = 0;
5946 so_unlock_single(so, SOLOCKED);
5947 mutex_exit(&so->so_lock);
5948
5949 if (error != 0)
5950 return (error);
5951 }
5952
5953 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5954 if (error == 0)
5955 sti->sti_pushcnt++;
5956 return (error);
5957
5958 case I_POP:
5959 if (sti->sti_pushcnt == 0) {
5960 /* Emulate sockmod being popped */
5961 dprintso(so, 0,
5962 ("socktpi_ioctl: going to STREAMS version\n"));
5963 return (so_sock2stream(so));
5964 }
5965
5966 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5967 if (error == 0)
5968 sti->sti_pushcnt--;
5969 return (error);
5970
5971 case I_LIST: {
5972 struct str_mlist *kmlistp, *umlistp;
5973 struct str_list kstrlist;
5974 ssize_t kstrlistsize;
5975 int i, nmods;
5976
5977 STRUCT_DECL(str_list, ustrlist);
5978 STRUCT_INIT(ustrlist, mode);
5979
5980 if (arg == 0) {
5981 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5982 if (error == 0)
5983 (*rvalp)++; /* Add one for sockmod */
5984 return (error);
5985 }
5986
5987 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
5988 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
5989 if (error != 0)
5990 return (error);
5991
5992 nmods = STRUCT_FGET(ustrlist, sl_nmods);
5993 if (nmods <= 0)
5994 return (EINVAL);
5995 /*
5996 * Ceiling nmods at nstrpush to prevent someone from
5997 * maliciously consuming lots of kernel memory.
5998 */
5999 nmods = MIN(nmods, nstrpush);
6000
6001 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6002 kstrlist.sl_nmods = nmods;
6003 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6004
6005 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6006 cr, rvalp);
6007 if (error != 0)
6008 goto done;
6009
6010 /*
6011 * Considering the module list as a 0-based array of sl_nmods
6012 * modules, sockmod should conceptually exist at slot
6013 * sti_pushcnt. Insert sockmod at this location by sliding all
6014 * of the module names after so_pushcnt over by one. We know
6015 * that there will be room to do this since we allocated
6016 * sl_modlist with an additional slot.
6017 */
6018 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6019 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6020
6021 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6022 kstrlist.sl_nmods++;
6023
6024 /*
6025 * Copy all of the entries out to ustrlist.
6026 */
6027 kmlistp = kstrlist.sl_modlist;
6028 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6029 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6030 error = so_copyout(kmlistp++, umlistp++,
6031 sizeof (struct str_mlist), mode & FKIOCTL);
6032 if (error != 0)
6033 goto done;
6034 }
6035
6036 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6037 mode & FKIOCTL);
6038 if (error == 0)
6039 *rvalp = 0;
6040 done:
6041 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6042 return (error);
6043 }
6044 case I_LOOK:
6045 if (sti->sti_pushcnt == 0) {
6046 return (so_copyout(sockmod_name, (void *)arg,
6047 sizeof (sockmod_name), mode & FKIOCTL));
6048 }
6049 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6050
6051 case I_FIND:
6052 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6053 if (error && error != EINVAL)
6054 return (error);
6055
6056 /* if not found and string was sockmod return 1 */
6057 if (*rvalp == 0 || error == EINVAL) {
6058 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6059 (void *)arg, mname, sizeof (mname), NULL);
6060 if (error == ENAMETOOLONG)
6061 error = EINVAL;
6062
6063 if (error == 0 && strcmp(mname, sockmod_name) == 0)
6064 *rvalp = 1;
6065 }
6066 return (error);
6067
6068 default:
6069 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6070 break;
6071 }
6072
6073 return (0);
6074 }
6075
6076 /*
6077 * Wrapper around the streams poll routine that implements socket poll
6078 * semantics.
6079 * The sockfs never calls pollwakeup itself - the stream head take care
6080 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6081 * stream head there can never be a deadlock due to holding so_lock across
6082 * pollwakeup and acquiring so_lock in this routine.
6083 *
6084 * However, since the performance of VOP_POLL is critical we avoid
6085 * acquiring so_lock here. This is based on two assumptions:
6086 * - The poll implementation holds locks to serialize the VOP_POLL call
6087 * and a pollwakeup for the same pollhead. This ensures that should
6088 * e.g. so_state change during a socktpi_poll call the pollwakeup
6089 * (which strsock_* and strrput conspire to issue) is issued after
6090 * the state change. Thus the pollwakeup will block until VOP_POLL has
6091 * returned and then wake up poll and have it call VOP_POLL again.
6092 * - The reading of so_state without holding so_lock does not result in
6093 * stale data that is older than the latest state change that has dropped
6094 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6095 * memory barrier to force the data into the coherency domain.
6096 */
6097 static int
sotpi_poll(struct sonode * so,short events,int anyyet,short * reventsp,struct pollhead ** phpp)6098 sotpi_poll(
6099 struct sonode *so,
6100 short events,
6101 int anyyet,
6102 short *reventsp,
6103 struct pollhead **phpp)
6104 {
6105 short origevents = events;
6106 struct vnode *vp = SOTOV(so);
6107 int error;
6108 int so_state = so->so_state; /* snapshot */
6109 sotpi_info_t *sti = SOTOTPI(so);
6110
6111 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6112 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6113
6114 ASSERT(vp->v_type == VSOCK);
6115 ASSERT(vp->v_stream != NULL);
6116
6117 if (so->so_version == SOV_STREAM) {
6118 /* The imaginary "sockmod" has been popped - act as a stream */
6119 return (strpoll(vp->v_stream, events, anyyet,
6120 reventsp, phpp));
6121 }
6122
6123 if (!(so_state & SS_ISCONNECTED) &&
6124 (so->so_mode & SM_CONNREQUIRED)) {
6125 /* Not connected yet - turn off write side events */
6126 events &= ~(POLLOUT|POLLWRBAND);
6127 }
6128 /*
6129 * Check for errors without calling strpoll if the caller wants them.
6130 * In sockets the errors are represented as input/output events
6131 * and there is no need to ask the stream head for this information.
6132 */
6133 if (so->so_error != 0 &&
6134 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6135 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6136 return (0);
6137 }
6138 /*
6139 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6140 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6141 * will not trigger a POLLIN event with POLLRDDATA set.
6142 * The handling of urgent data (causing POLLRDBAND) is done by
6143 * inspecting SS_OOBPEND below.
6144 */
6145 events |= POLLRDDATA;
6146
6147 /*
6148 * After shutdown(output) a stream head write error is set.
6149 * However, we should not return output events.
6150 */
6151 events |= POLLNOERR;
6152 error = strpoll(vp->v_stream, events, anyyet,
6153 reventsp, phpp);
6154 if (error)
6155 return (error);
6156
6157 ASSERT(!(*reventsp & POLLERR));
6158
6159 /*
6160 * Notes on T_CONN_IND handling for sockets.
6161 *
6162 * If strpoll() returned without events, SR_POLLIN is guaranteed
6163 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6164 *
6165 * Since the so_lock is not held, soqueueconnind() may have run
6166 * and a T_CONN_IND may be waiting. We now check for any queued
6167 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6168 * to ensure poll returns.
6169 *
6170 * However:
6171 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6172 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6173 * the following actions will occur; taken together they ensure the
6174 * syscall will return.
6175 *
6176 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6177 * the accept() was run on a non-blocking socket sowaitconnind()
6178 * may have already returned EWOULDBLOCK, so not be waiting to
6179 * process the message. Additionally socktpi_poll() has probably
6180 * proceeded past the sti_conn_ind_head check below.
6181 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6182 * this thread, however that could occur before poll_common()
6183 * has entered cv_wait.
6184 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6185 *
6186 * Before proceeding to cv_wait() in poll_common() for an event,
6187 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6188 * and if set, re-calls strpoll() to ensure the late arriving
6189 * T_CONN_IND is recognized, and pollsys() returns.
6190 */
6191
6192 if (sti->sti_conn_ind_head != NULL)
6193 *reventsp |= (POLLIN|POLLRDNORM) & events;
6194
6195 if (so->so_state & SS_CANTRCVMORE) {
6196 *reventsp |= POLLRDHUP & events;
6197
6198 if (so->so_state & SS_CANTSENDMORE)
6199 *reventsp |= POLLHUP;
6200 }
6201
6202 if (so->so_state & SS_OOBPEND)
6203 *reventsp |= POLLRDBAND & events;
6204
6205 return (0);
6206 }
6207
6208 /*ARGSUSED*/
6209 static int
socktpi_constructor(void * buf,void * cdrarg,int kmflags)6210 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6211 {
6212 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6213 int error = 0;
6214
6215 error = sonode_constructor(buf, cdrarg, kmflags);
6216 if (error != 0)
6217 return (error);
6218
6219 error = i_sotpi_info_constructor(&st->st_info);
6220 if (error != 0)
6221 sonode_destructor(buf, cdrarg);
6222
6223 st->st_sonode.so_priv = &st->st_info;
6224
6225 return (error);
6226 }
6227
6228 /*ARGSUSED1*/
6229 static void
socktpi_destructor(void * buf,void * cdrarg)6230 socktpi_destructor(void *buf, void *cdrarg)
6231 {
6232 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6233
6234 ASSERT(st->st_sonode.so_priv == &st->st_info);
6235 st->st_sonode.so_priv = NULL;
6236
6237 i_sotpi_info_destructor(&st->st_info);
6238 sonode_destructor(buf, cdrarg);
6239 }
6240
6241 static int
socktpi_unix_constructor(void * buf,void * cdrarg,int kmflags)6242 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6243 {
6244 int retval;
6245
6246 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6247 struct sonode *so = (struct sonode *)buf;
6248 sotpi_info_t *sti = SOTOTPI(so);
6249
6250 mutex_enter(&socklist.sl_lock);
6251
6252 sti->sti_next_so = socklist.sl_list;
6253 sti->sti_prev_so = NULL;
6254 if (sti->sti_next_so != NULL)
6255 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6256 socklist.sl_list = so;
6257
6258 mutex_exit(&socklist.sl_lock);
6259
6260 }
6261 return (retval);
6262 }
6263
6264 static void
socktpi_unix_destructor(void * buf,void * cdrarg)6265 socktpi_unix_destructor(void *buf, void *cdrarg)
6266 {
6267 struct sonode *so = (struct sonode *)buf;
6268 sotpi_info_t *sti = SOTOTPI(so);
6269
6270 mutex_enter(&socklist.sl_lock);
6271
6272 if (sti->sti_next_so != NULL)
6273 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6274 if (sti->sti_prev_so != NULL)
6275 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6276 else
6277 socklist.sl_list = sti->sti_next_so;
6278
6279 mutex_exit(&socklist.sl_lock);
6280
6281 socktpi_destructor(buf, cdrarg);
6282 }
6283
6284 int
socktpi_init(void)6285 socktpi_init(void)
6286 {
6287 /*
6288 * Create sonode caches. We create a special one for AF_UNIX so
6289 * that we can track them for netstat(8).
6290 */
6291 socktpi_cache = kmem_cache_create("socktpi_cache",
6292 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6293 socktpi_destructor, NULL, NULL, NULL, 0);
6294
6295 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6296 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6297 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6298
6299 return (0);
6300 }
6301
6302 /*
6303 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6304 *
6305 * Caller must still update state and mode using sotpi_update_state().
6306 */
6307 int
sotpi_convert_sonode(struct sonode * so,struct sockparams * newsp,boolean_t * direct,queue_t ** qp,struct cred * cr)6308 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6309 boolean_t *direct, queue_t **qp, struct cred *cr)
6310 {
6311 sotpi_info_t *sti;
6312 struct sockparams *origsp = so->so_sockparams;
6313 sock_lower_handle_t handle = so->so_proto_handle;
6314 struct stdata *stp;
6315 struct vnode *vp;
6316 queue_t *q;
6317 int error = 0;
6318
6319 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6320 SS_FALLBACK_PENDING);
6321 ASSERT(SOCK_IS_NONSTR(so));
6322
6323 *qp = NULL;
6324 *direct = B_FALSE;
6325 so->so_sockparams = newsp;
6326 /*
6327 * Allocate and initalize fields required by TPI.
6328 */
6329 (void) sotpi_info_create(so, KM_SLEEP);
6330 sotpi_info_init(so);
6331
6332 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6333 sotpi_info_fini(so);
6334 sotpi_info_destroy(so);
6335 return (error);
6336 }
6337 ASSERT(handle == so->so_proto_handle);
6338 sti = SOTOTPI(so);
6339 if (sti->sti_direct != 0)
6340 *direct = B_TRUE;
6341
6342 /*
6343 * Keep the original sp around so we can properly dispose of the
6344 * sonode when the socket is being closed.
6345 */
6346 sti->sti_orig_sp = origsp;
6347
6348 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6349 so_alloc_addr(so, so->so_max_addr_len);
6350
6351 /*
6352 * If the application has done a SIOCSPGRP, make sure the
6353 * STREAM head is aware. This needs to take place before
6354 * the protocol start sending up messages. Otherwise we
6355 * might miss to generate SIGPOLL.
6356 *
6357 * It is possible that the application will receive duplicate
6358 * signals if some were already generated for either data or
6359 * connection indications.
6360 */
6361 if (so->so_pgrp != 0) {
6362 if (so_set_events(so, so->so_vnode, cr) != 0)
6363 so->so_pgrp = 0;
6364 }
6365
6366 /*
6367 * Determine which queue to use.
6368 */
6369 vp = SOTOV(so);
6370 stp = vp->v_stream;
6371 ASSERT(stp != NULL);
6372 q = stp->sd_wrq->q_next;
6373
6374 /*
6375 * Skip any modules that may have been auto pushed when the device
6376 * was opened
6377 */
6378 while (q->q_next != NULL)
6379 q = q->q_next;
6380 *qp = _RD(q);
6381
6382 /* This is now a STREAMS sockets */
6383 so->so_not_str = B_FALSE;
6384
6385 return (error);
6386 }
6387
6388 /*
6389 * Revert a TPI sonode. It is only allowed to revert the sonode during
6390 * the fallback process.
6391 */
6392 void
sotpi_revert_sonode(struct sonode * so,struct cred * cr)6393 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6394 {
6395 vnode_t *vp = SOTOV(so);
6396
6397 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6398 SS_FALLBACK_PENDING);
6399 ASSERT(!SOCK_IS_NONSTR(so));
6400 ASSERT(vp->v_stream != NULL);
6401
6402 strclean(vp);
6403 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6404
6405 /*
6406 * Restore the original sockparams. The caller is responsible for
6407 * dropping the ref to the new sp.
6408 */
6409 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6410
6411 sotpi_info_fini(so);
6412 sotpi_info_destroy(so);
6413
6414 /* This is no longer a STREAMS sockets */
6415 so->so_not_str = B_TRUE;
6416 }
6417
6418 void
sotpi_update_state(struct sonode * so,struct T_capability_ack * tcap,struct sockaddr * laddr,socklen_t laddrlen,struct sockaddr * faddr,socklen_t faddrlen,short opts)6419 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6420 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6421 socklen_t faddrlen, short opts)
6422 {
6423 sotpi_info_t *sti = SOTOTPI(so);
6424
6425 so_proc_tcapability_ack(so, tcap);
6426
6427 so->so_options |= opts;
6428
6429 /*
6430 * Determine whether the foreign and local address are valid
6431 */
6432 if (laddrlen != 0) {
6433 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6434 sti->sti_laddr_len = laddrlen;
6435 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6436 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6437 }
6438
6439 if (faddrlen != 0) {
6440 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6441 sti->sti_faddr_len = faddrlen;
6442 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6443 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6444 }
6445
6446 }
6447
6448 /*
6449 * Allocate enough space to cache the local and foreign addresses.
6450 */
6451 void
so_alloc_addr(struct sonode * so,t_uscalar_t maxlen)6452 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6453 {
6454 sotpi_info_t *sti = SOTOTPI(so);
6455
6456 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6457 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6458 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6459 P2ROUNDUP(maxlen, KMEM_ALIGN);
6460 so->so_max_addr_len = sti->sti_laddr_maxlen;
6461 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6462 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6463 + sti->sti_laddr_maxlen);
6464
6465 if (so->so_family == AF_UNIX) {
6466 /*
6467 * Initialize AF_UNIX related fields.
6468 */
6469 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6470 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6471 }
6472 }
6473
6474
6475 sotpi_info_t *
sotpi_sototpi(struct sonode * so)6476 sotpi_sototpi(struct sonode *so)
6477 {
6478 sotpi_info_t *sti;
6479
6480 ASSERT(so != NULL);
6481
6482 sti = (sotpi_info_t *)so->so_priv;
6483
6484 ASSERT(sti != NULL);
6485 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6486
6487 return (sti);
6488 }
6489
6490 static int
i_sotpi_info_constructor(sotpi_info_t * sti)6491 i_sotpi_info_constructor(sotpi_info_t *sti)
6492 {
6493 sti->sti_magic = SOTPI_INFO_MAGIC;
6494 sti->sti_ack_mp = NULL;
6495 sti->sti_discon_ind_mp = NULL;
6496 sti->sti_ux_bound_vp = NULL;
6497 sti->sti_unbind_mp = NULL;
6498
6499 sti->sti_conn_ind_head = NULL;
6500 sti->sti_conn_ind_tail = NULL;
6501
6502 sti->sti_laddr_sa = NULL;
6503 sti->sti_faddr_sa = NULL;
6504
6505 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6506 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6507
6508 return (0);
6509 }
6510
6511 static void
i_sotpi_info_destructor(sotpi_info_t * sti)6512 i_sotpi_info_destructor(sotpi_info_t *sti)
6513 {
6514 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6515 ASSERT(sti->sti_ack_mp == NULL);
6516 ASSERT(sti->sti_discon_ind_mp == NULL);
6517 ASSERT(sti->sti_ux_bound_vp == NULL);
6518 ASSERT(sti->sti_unbind_mp == NULL);
6519
6520 ASSERT(sti->sti_conn_ind_head == NULL);
6521 ASSERT(sti->sti_conn_ind_tail == NULL);
6522
6523 ASSERT(sti->sti_laddr_sa == NULL);
6524 ASSERT(sti->sti_faddr_sa == NULL);
6525
6526 mutex_destroy(&sti->sti_plumb_lock);
6527 cv_destroy(&sti->sti_ack_cv);
6528 }
6529
6530 /*
6531 * Creates and attaches TPI information to the given sonode
6532 */
6533 static boolean_t
sotpi_info_create(struct sonode * so,int kmflags)6534 sotpi_info_create(struct sonode *so, int kmflags)
6535 {
6536 sotpi_info_t *sti;
6537
6538 ASSERT(so->so_priv == NULL);
6539
6540 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6541 return (B_FALSE);
6542
6543 if (i_sotpi_info_constructor(sti) != 0) {
6544 kmem_free(sti, sizeof (*sti));
6545 return (B_FALSE);
6546 }
6547
6548 so->so_priv = (void *)sti;
6549 return (B_TRUE);
6550 }
6551
6552 /*
6553 * Initializes the TPI information.
6554 */
6555 static void
sotpi_info_init(struct sonode * so)6556 sotpi_info_init(struct sonode *so)
6557 {
6558 struct vnode *vp = SOTOV(so);
6559 sotpi_info_t *sti = SOTOTPI(so);
6560 time_t now;
6561
6562 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6563 vp->v_rdev = sti->sti_dev;
6564
6565 sti->sti_orig_sp = NULL;
6566
6567 sti->sti_pushcnt = 0;
6568
6569 now = gethrestime_sec();
6570 sti->sti_atime = now;
6571 sti->sti_mtime = now;
6572 sti->sti_ctime = now;
6573
6574 sti->sti_eaddr_mp = NULL;
6575 sti->sti_delayed_error = 0;
6576
6577 sti->sti_provinfo = NULL;
6578
6579 sti->sti_oobcnt = 0;
6580 sti->sti_oobsigcnt = 0;
6581
6582 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6583
6584 sti->sti_laddr_sa = 0;
6585 sti->sti_faddr_sa = 0;
6586 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6587 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6588
6589 sti->sti_laddr_valid = 0;
6590 sti->sti_faddr_valid = 0;
6591 sti->sti_faddr_noxlate = 0;
6592
6593 sti->sti_direct = 0;
6594
6595 ASSERT(sti->sti_ack_mp == NULL);
6596 ASSERT(sti->sti_ux_bound_vp == NULL);
6597 ASSERT(sti->sti_unbind_mp == NULL);
6598
6599 ASSERT(sti->sti_conn_ind_head == NULL);
6600 ASSERT(sti->sti_conn_ind_tail == NULL);
6601 }
6602
6603 /*
6604 * Given a sonode, grab the TPI info and free any data.
6605 */
6606 static void
sotpi_info_fini(struct sonode * so)6607 sotpi_info_fini(struct sonode *so)
6608 {
6609 sotpi_info_t *sti = SOTOTPI(so);
6610 mblk_t *mp;
6611
6612 ASSERT(sti->sti_discon_ind_mp == NULL);
6613
6614 if ((mp = sti->sti_conn_ind_head) != NULL) {
6615 mblk_t *mp1;
6616
6617 while (mp) {
6618 mp1 = mp->b_next;
6619 mp->b_next = NULL;
6620 freemsg(mp);
6621 mp = mp1;
6622 }
6623 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6624 }
6625
6626 /*
6627 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6628 * indirect them. It also uses so_count as a validity test.
6629 */
6630 mutex_enter(&so->so_lock);
6631
6632 if (sti->sti_laddr_sa) {
6633 ASSERT((caddr_t)sti->sti_faddr_sa ==
6634 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6635 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6636 sti->sti_laddr_valid = 0;
6637 sti->sti_faddr_valid = 0;
6638 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6639 sti->sti_laddr_sa = NULL;
6640 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6641 sti->sti_faddr_sa = NULL;
6642 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6643 }
6644
6645 mutex_exit(&so->so_lock);
6646
6647 if ((mp = sti->sti_eaddr_mp) != NULL) {
6648 freemsg(mp);
6649 sti->sti_eaddr_mp = NULL;
6650 sti->sti_delayed_error = 0;
6651 }
6652
6653 if ((mp = sti->sti_ack_mp) != NULL) {
6654 freemsg(mp);
6655 sti->sti_ack_mp = NULL;
6656 }
6657
6658 ASSERT(sti->sti_ux_bound_vp == NULL);
6659 if ((mp = sti->sti_unbind_mp) != NULL) {
6660 freemsg(mp);
6661 sti->sti_unbind_mp = NULL;
6662 }
6663 }
6664
6665 /*
6666 * Destroys the TPI information attached to a sonode.
6667 */
6668 static void
sotpi_info_destroy(struct sonode * so)6669 sotpi_info_destroy(struct sonode *so)
6670 {
6671 sotpi_info_t *sti = SOTOTPI(so);
6672
6673 i_sotpi_info_destructor(sti);
6674 kmem_free(sti, sizeof (*sti));
6675
6676 so->so_priv = NULL;
6677 }
6678
6679 /*
6680 * Create the global sotpi socket module entry. It will never be freed.
6681 */
6682 smod_info_t *
sotpi_smod_create(void)6683 sotpi_smod_create(void)
6684 {
6685 smod_info_t *smodp;
6686
6687 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6688 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6689 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6690 /*
6691 * Initialize the smod_refcnt to 1 so it will never be freed.
6692 */
6693 smodp->smod_refcnt = 1;
6694 smodp->smod_uc_version = SOCK_UC_VERSION;
6695 smodp->smod_dc_version = SOCK_DC_VERSION;
6696 smodp->smod_sock_create_func = &sotpi_create;
6697 smodp->smod_sock_destroy_func = &sotpi_destroy;
6698 return (smodp);
6699 }
6700