1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/t_lock.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/cred.h>
35 #include <sys/kmem.h>
36 #include <sys/kmem_impl.h>
37 #include <sys/sysmacros.h>
38 #include <sys/vfs.h>
39 #include <sys/vnode.h>
40 #include <sys/debug.h>
41 #include <sys/errno.h>
42 #include <sys/time.h>
43 #include <sys/file.h>
44 #include <sys/open.h>
45 #include <sys/user.h>
46 #include <sys/termios.h>
47 #include <sys/stream.h>
48 #include <sys/strsubr.h>
49 #include <sys/strsun.h>
50 #include <sys/suntpi.h>
51 #include <sys/ddi.h>
52 #include <sys/esunddi.h>
53 #include <sys/flock.h>
54 #include <sys/modctl.h>
55 #include <sys/vtrace.h>
56 #include <sys/cmn_err.h>
57 #include <sys/pathname.h>
58
59 #include <sys/socket.h>
60 #include <sys/socketvar.h>
61 #include <sys/sockio.h>
62 #include <netinet/in.h>
63 #include <sys/un.h>
64 #include <sys/strsun.h>
65
66 #include <sys/tiuser.h>
67 #define _SUN_TPI_VERSION 2
68 #include <sys/tihdr.h>
69 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
70
71 #include <c2/audit.h>
72
73 #include <inet/common.h>
74 #include <inet/ip.h>
75 #include <inet/ip6.h>
76 #include <inet/tcp.h>
77 #include <inet/udp_impl.h>
78
79 #include <sys/zone.h>
80
81 #include <fs/sockfs/nl7c.h>
82 #include <fs/sockfs/nl7curi.h>
83
84 #include <fs/sockfs/sockcommon.h>
85 #include <fs/sockfs/socktpi.h>
86 #include <fs/sockfs/socktpi_impl.h>
87
88 /*
89 * Possible failures when memory can't be allocated. The documented behavior:
90 *
91 * 5.5: 4.X: XNET:
92 * accept: ENOMEM/ENOSR/EINTR - (EINTR) ENOMEM/ENOBUFS/ENOSR/
93 * EINTR
94 * (4.X does not document EINTR but returns it)
95 * bind: ENOSR - ENOBUFS/ENOSR
96 * connect: EINTR EINTR ENOBUFS/ENOSR/EINTR
97 * getpeername: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
98 * getsockname: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
99 * (4.X getpeername and getsockname do not fail in practice)
100 * getsockopt: ENOMEM/ENOSR - ENOBUFS/ENOSR
101 * listen: - - ENOBUFS
102 * recv: ENOMEM/ENOSR/EINTR EINTR ENOBUFS/ENOMEM/ENOSR/
103 * EINTR
104 * send: ENOMEM/ENOSR/EINTR ENOBUFS/EINTR ENOBUFS/ENOMEM/ENOSR/
105 * EINTR
106 * setsockopt: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
107 * shutdown: ENOMEM/ENOSR - ENOBUFS/ENOSR
108 * socket: ENOMEM/ENOSR ENOBUFS ENOBUFS/ENOMEM/ENOSR
109 * socketpair: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
110 *
111 * Resolution. When allocation fails:
112 * recv: return EINTR
113 * send: return EINTR
114 * connect, accept: EINTR
115 * bind, listen, shutdown (unbind, unix_close, disconnect): sleep
116 * socket, socketpair: ENOBUFS
117 * getpeername, getsockname: sleep
118 * getsockopt, setsockopt: sleep
119 */
120
/*
 * Switches that make sockfs do something other than the standard TPI
 * for the AF_INET transports.  In a SOCK_TEST build they are ordinary
 * variables; in every other build they are compile-time constants.
 *
 * solisten_tpi_tcp:
 *	TCP can handle a O_T_BIND_REQ with an increased backlog even though
 *	the transport is already bound. This is needed to avoid losing the
 *	port number should listen() do a T_UNBIND_REQ followed by a
 *	O_T_BIND_REQ.
 *
 * soconnect_tpi_udp:
 *	UDP and ICMP can handle a T_CONN_REQ.
 *	This is needed to make the sequence of connect(), getsockname()
 *	return the local IP address used to send packets to the connected to
 *	destination.
 *
 * soconnect_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
 *	Set this to non-zero to send TPI conformant messages to TCP in this
 *	respect. This is a performance optimization.
 *
 * soaccept_tpi_tcp:
 *	TCP can handle a T_CONN_REQ without the acceptor being bound.
 *	This is a performance optimization that has been picked up in XTI.
 *
 * soaccept_tpi_multioptions:
 *	When inheriting SOL_SOCKET options from the listener to the accepting
 *	socket send them as a single message for AF_INET{,6}.
 */
#ifndef SOCK_TEST

#define	solisten_tpi_tcp		0
#define	soconnect_tpi_udp		0
#define	soconnect_tpi_tcp		0
#define	soaccept_tpi_tcp		0
#define	soaccept_tpi_multioptions	1

#else	/* SOCK_TEST */

int solisten_tpi_tcp = 0;
int soconnect_tpi_udp = 0;
int soconnect_tpi_tcp = 0;
int soaccept_tpi_tcp = 0;
int soaccept_tpi_multioptions = 1;

/* Test-only hooks that are defined outside this file. */
extern int do_useracc;
extern clock_t sock_test_timelimit;

#endif	/* SOCK_TEST */
168
/*
 * Patchable knobs for the checks that were added for X/Open conformance.
 * Some of those checks might have to be backed out to keep SunOS 4.X
 * applications working.
 */

/* Non-zero disables the X/Open added checks. */
int xnet_skip_checks = 0;

/* Non-zero prints a message when an X/Open check causes a failure. */
int xnet_check_print = 0;

/*
 * NOTE(review): presumably enables a message when something is truncated
 * for X/Open reasons; no use is visible in this part of the file - confirm.
 */
int xnet_truncate_print = 0;
176
177 static void sotpi_destroy(struct sonode *);
178 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
179 int, int *, cred_t *cr);
180
181 static boolean_t sotpi_info_create(struct sonode *, int);
182 static void sotpi_info_init(struct sonode *);
183 static void sotpi_info_fini(struct sonode *);
184 static void sotpi_info_destroy(struct sonode *);
185
186 /*
187 * Do direct function call to the transport layer below; this would
188 * also allow the transport to utilize read-side synchronous stream
189 * interface if necessary. This is a /etc/system tunable that must
190 * not be modified on a running system. By default this is enabled
191 * for performance reasons and may be disabled for debugging purposes.
192 */
193 boolean_t socktpi_direct = B_TRUE;
194
195 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
196
197 extern void sigintr(k_sigset_t *, int);
198 extern void sigunintr(k_sigset_t *);
199
200 static int sotpi_unbind(struct sonode *, int);
201
202 /* TPI sockfs sonode operations */
203 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
204 int);
205 static int sotpi_accept(struct sonode *, int, struct cred *,
206 struct sonode **);
207 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
208 int, struct cred *);
209 static int sotpi_listen(struct sonode *, int, struct cred *);
210 static int sotpi_connect(struct sonode *, struct sockaddr *,
211 socklen_t, int, int, struct cred *);
212 extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
213 struct uio *, struct cred *);
214 static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
215 struct uio *, struct cred *);
216 static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
217 struct cred *, mblk_t **);
218 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
219 struct uio *, void *, t_uscalar_t, int);
220 static int sodgram_direct(struct sonode *, struct sockaddr *,
221 socklen_t, struct uio *, int);
222 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
223 socklen_t *, boolean_t, struct cred *);
224 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
225 socklen_t *, struct cred *);
226 static int sotpi_shutdown(struct sonode *, int, struct cred *);
227 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
228 socklen_t *, int, struct cred *);
229 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
230 socklen_t, struct cred *);
231 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
232 int32_t *);
233 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
234 struct cred *, int32_t *);
235 static int sotpi_poll(struct sonode *, short, int, short *,
236 struct pollhead **);
237 static int sotpi_close(struct sonode *, int, struct cred *);
238
239 static int i_sotpi_info_constructor(sotpi_info_t *);
240 static void i_sotpi_info_destructor(sotpi_info_t *);
241
242 sonodeops_t sotpi_sonodeops = {
243 sotpi_init, /* sop_init */
244 sotpi_accept, /* sop_accept */
245 sotpi_bind, /* sop_bind */
246 sotpi_listen, /* sop_listen */
247 sotpi_connect, /* sop_connect */
248 sotpi_recvmsg, /* sop_recvmsg */
249 sotpi_sendmsg, /* sop_sendmsg */
250 sotpi_sendmblk, /* sop_sendmblk */
251 sotpi_getpeername, /* sop_getpeername */
252 sotpi_getsockname, /* sop_getsockname */
253 sotpi_shutdown, /* sop_shutdown */
254 sotpi_getsockopt, /* sop_getsockopt */
255 sotpi_setsockopt, /* sop_setsockopt */
256 sotpi_ioctl, /* sop_ioctl */
257 sotpi_poll, /* sop_poll */
258 sotpi_close, /* sop_close */
259 };
260
261 /*
262 * Return a TPI socket vnode.
263 *
264 * Note that sockets assume that the driver will clone (either itself
265 * or by using the clone driver) i.e. a socket() call will always
266 * result in a new vnode being created.
267 */
268
269 /*
270 * Common create code for socket and accept. If tso is set the values
271 * from that node is used instead of issuing a T_INFO_REQ.
272 */
273
274 /* ARGSUSED */
275 static struct sonode *
sotpi_create(struct sockparams * sp,int family,int type,int protocol,int version,int sflags,int * errorp,cred_t * cr)276 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
277 int version, int sflags, int *errorp, cred_t *cr)
278 {
279 struct sonode *so;
280 kmem_cache_t *cp;
281 int sfamily = family;
282
283 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
284
285 if (family == AF_NCA) {
286 /*
287 * The request is for an NCA socket so for NL7C use the
288 * INET domain instead and mark NL7C_AF_NCA below.
289 */
290 family = AF_INET;
291 /*
292 * NL7C is not supported in the non-global zone,
293 * we enforce this restriction here.
294 */
295 if (getzoneid() != GLOBAL_ZONEID) {
296 *errorp = ENOTSUP;
297 return (NULL);
298 }
299 }
300
301 /*
302 * to be compatible with old tpi socket implementation ignore
303 * sleep flag (sflags) passed in
304 */
305 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
306 so = kmem_cache_alloc(cp, KM_SLEEP);
307 if (so == NULL) {
308 *errorp = ENOMEM;
309 return (NULL);
310 }
311
312 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
313 sotpi_info_init(so);
314
315 if (sfamily == AF_NCA) {
316 SOTOTPI(so)->sti_nl7c_flags = NL7C_AF_NCA;
317 }
318
319 if (version == SOV_DEFAULT)
320 version = so_default_version;
321
322 so->so_version = (short)version;
323 *errorp = 0;
324
325 return (so);
326 }
327
328 static void
sotpi_destroy(struct sonode * so)329 sotpi_destroy(struct sonode *so)
330 {
331 kmem_cache_t *cp;
332 struct sockparams *origsp;
333
334 /*
335 * If there is a new dealloc function (ie. smod_destroy_func),
336 * then it should check the correctness of the ops.
337 */
338
339 ASSERT(so->so_ops == &sotpi_sonodeops);
340
341 origsp = SOTOTPI(so)->sti_orig_sp;
342
343 sotpi_info_fini(so);
344
345 if (so->so_state & SS_FALLBACK_COMP) {
346 /*
347 * A fallback happend, which means that a sotpi_info_t struct
348 * was allocated (as opposed to being allocated from the TPI
349 * sonode cache. Therefore we explicitly free the struct
350 * here.
351 */
352 sotpi_info_destroy(so);
353 ASSERT(origsp != NULL);
354
355 origsp->sp_smod_info->smod_sock_destroy_func(so);
356 SOCKPARAMS_DEC_REF(origsp);
357 } else {
358 sonode_fini(so);
359 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
360 socktpi_cache;
361 kmem_cache_free(cp, so);
362 }
363 }
364
365 /* ARGSUSED1 */
366 int
sotpi_init(struct sonode * so,struct sonode * tso,struct cred * cr,int flags)367 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
368 {
369 major_t maj;
370 dev_t newdev;
371 struct vnode *vp;
372 int error = 0;
373 struct stdata *stp;
374
375 sotpi_info_t *sti = SOTOTPI(so);
376
377 dprint(1, ("sotpi_init()\n"));
378
379 /*
380 * over write the sleep flag passed in but that is ok
381 * as tpi socket does not honor sleep flag.
382 */
383 flags |= FREAD|FWRITE;
384
385 /*
386 * Record in so_flag that it is a clone.
387 */
388 if (getmajor(sti->sti_dev) == clone_major)
389 so->so_flag |= SOCLONE;
390
391 if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
392 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
393 (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
394 so->so_protocol == IPPROTO_IP)) {
395 /* Tell tcp or udp that it's talking to sockets */
396 flags |= SO_SOCKSTR;
397
398 /*
399 * Here we indicate to socktpi_open() our attempt to
400 * make direct calls between sockfs and transport.
401 * The final decision is left to socktpi_open().
402 */
403 sti->sti_direct = 1;
404
405 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
406 if (so->so_type == SOCK_STREAM && tso != NULL) {
407 if (SOTOTPI(tso)->sti_direct) {
408 /*
409 * Inherit sti_direct from listener and pass
410 * SO_ACCEPTOR open flag to tcp, indicating
411 * that this is an accept fast-path instance.
412 */
413 flags |= SO_ACCEPTOR;
414 } else {
415 /*
416 * sti_direct is not set on listener, meaning
417 * that the listener has been converted from
418 * a socket to a stream. Ensure that the
419 * acceptor inherits these settings.
420 */
421 sti->sti_direct = 0;
422 flags &= ~SO_SOCKSTR;
423 }
424 }
425 }
426
427 /*
428 * Tell local transport that it is talking to sockets.
429 */
430 if (so->so_family == AF_UNIX) {
431 flags |= SO_SOCKSTR;
432 }
433
434 vp = SOTOV(so);
435 newdev = vp->v_rdev;
436 maj = getmajor(newdev);
437 ASSERT(STREAMSTAB(maj));
438
439 error = stropen(vp, &newdev, flags, cr);
440
441 stp = vp->v_stream;
442 if (error == 0) {
443 if (so->so_flag & SOCLONE)
444 ASSERT(newdev != vp->v_rdev);
445 mutex_enter(&so->so_lock);
446 sti->sti_dev = newdev;
447 vp->v_rdev = newdev;
448 mutex_exit(&so->so_lock);
449
450 if (stp->sd_flag & STRISTTY) {
451 /*
452 * this is a post SVR4 tty driver - a socket can not
453 * be a controlling terminal. Fail the open.
454 */
455 (void) sotpi_close(so, flags, cr);
456 return (ENOTTY); /* XXX */
457 }
458
459 ASSERT(stp->sd_wrq != NULL);
460 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
461
462 /*
463 * If caller is interested in doing direct function call
464 * interface to/from transport module, probe the module
465 * directly beneath the streamhead to see if it qualifies.
466 *
467 * We turn off the direct interface when qualifications fail.
468 * In the acceptor case, we simply turn off the sti_direct
469 * flag on the socket. We do the fallback after the accept
470 * has completed, before the new socket is returned to the
471 * application.
472 */
473 if (sti->sti_direct) {
474 queue_t *tq = stp->sd_wrq->q_next;
475
476 /*
477 * sti_direct is currently supported and tested
478 * only for tcp/udp; this is the main reason to
479 * have the following assertions.
480 */
481 ASSERT(so->so_family == AF_INET ||
482 so->so_family == AF_INET6);
483 ASSERT(so->so_protocol == IPPROTO_UDP ||
484 so->so_protocol == IPPROTO_TCP ||
485 so->so_protocol == IPPROTO_IP);
486 ASSERT(so->so_type == SOCK_DGRAM ||
487 so->so_type == SOCK_STREAM);
488
489 /*
490 * Abort direct call interface if the module directly
491 * underneath the stream head is not defined with the
492 * _D_DIRECT flag. This could happen in the tcp or
493 * udp case, when some other module is autopushed
494 * above it, or for some reasons the expected module
495 * isn't purely D_MP (which is the main requirement).
496 */
497 if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
498 !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
499 int rval;
500
501 /* Continue on without direct calls */
502 sti->sti_direct = 0;
503
504 /*
505 * Cannot issue ioctl on fallback socket since
506 * there is no conn associated with the queue.
507 * The fallback downcall will notify the proto
508 * of the change.
509 */
510 if (!(flags & SO_ACCEPTOR) &&
511 !(flags & SO_FALLBACK)) {
512 if ((error = strioctl(vp,
513 _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
514 cr, &rval)) != 0) {
515 (void) sotpi_close(so, flags,
516 cr);
517 return (error);
518 }
519 }
520 }
521 }
522
523 if (flags & SO_FALLBACK) {
524 /*
525 * The stream created does not have a conn.
526 * do stream set up after conn has been assigned
527 */
528 return (error);
529 }
530 if (error = so_strinit(so, tso)) {
531 (void) sotpi_close(so, flags, cr);
532 return (error);
533 }
534
535 /* Wildcard */
536 if (so->so_protocol != so->so_sockparams->sp_protocol) {
537 int protocol = so->so_protocol;
538 /*
539 * Issue SO_PROTOTYPE setsockopt.
540 */
541 error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
542 &protocol, (t_uscalar_t)sizeof (protocol), cr);
543 if (error != 0) {
544 (void) sotpi_close(so, flags, cr);
545 /*
546 * Setsockopt often fails with ENOPROTOOPT but
547 * socket() should fail with
548 * EPROTONOSUPPORT/EPROTOTYPE.
549 */
550 return (EPROTONOSUPPORT);
551 }
552 }
553
554 } else {
555 /*
556 * While the same socket can not be reopened (unlike specfs)
557 * the stream head sets STREOPENFAIL when the autopush fails.
558 */
559 if ((stp != NULL) &&
560 (stp->sd_flag & STREOPENFAIL)) {
561 /*
562 * Open failed part way through.
563 */
564 mutex_enter(&stp->sd_lock);
565 stp->sd_flag &= ~STREOPENFAIL;
566 mutex_exit(&stp->sd_lock);
567 (void) sotpi_close(so, flags, cr);
568 return (error);
569 /*NOTREACHED*/
570 }
571 ASSERT(stp == NULL);
572 }
573 TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
574 "sockfs open:maj %d vp %p so %p error %d",
575 maj, vp, so, error);
576 return (error);
577 }
578
579 /*
580 * Bind the socket to an unspecified address in sockfs only.
581 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
582 * required in all cases.
583 */
584 static void
so_automatic_bind(struct sonode * so)585 so_automatic_bind(struct sonode *so)
586 {
587 sotpi_info_t *sti = SOTOTPI(so);
588 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
589
590 ASSERT(MUTEX_HELD(&so->so_lock));
591 ASSERT(!(so->so_state & SS_ISBOUND));
592 ASSERT(sti->sti_unbind_mp);
593
594 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
595 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
596 sti->sti_laddr_sa->sa_family = so->so_family;
597 so->so_state |= SS_ISBOUND;
598 }
599
600
601 /*
602 * bind the socket.
603 *
604 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
605 * are passed in we allow rebinding. Note that for backwards compatibility
606 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
607 * Thus the rebinding code is currently not executed.
608 *
609 * The constraints for rebinding are:
610 * - it is a SOCK_DGRAM, or
611 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
612 * and no listen() has been done.
613 * This rebinding code was added based on some language in the XNET book
614 * about not returning EINVAL if the protocol allows rebinding. However,
615 * this language is not present in the Posix socket draft. Thus maybe the
616 * rebinding logic should be deleted from the source.
617 *
618 * A null "name" can be used to unbind the socket if:
619 * - it is a SOCK_DGRAM, or
620 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
621 * and no listen() has been done.
622 */
623 /* ARGSUSED */
624 static int
sotpi_bindlisten(struct sonode * so,struct sockaddr * name,socklen_t namelen,int backlog,int flags,struct cred * cr)625 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
626 socklen_t namelen, int backlog, int flags, struct cred *cr)
627 {
628 struct T_bind_req bind_req;
629 struct T_bind_ack *bind_ack;
630 int error = 0;
631 mblk_t *mp;
632 void *addr;
633 t_uscalar_t addrlen;
634 int unbind_on_err = 1;
635 boolean_t clear_acceptconn_on_err = B_FALSE;
636 boolean_t restore_backlog_on_err = B_FALSE;
637 int save_so_backlog;
638 t_scalar_t PRIM_type = O_T_BIND_REQ;
639 boolean_t tcp_udp_xport;
640 void *nl7c = NULL;
641 sotpi_info_t *sti = SOTOTPI(so);
642
643 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
644 (void *)so, (void *)name, namelen, backlog, flags,
645 pr_state(so->so_state, so->so_mode)));
646
647 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
648
649 if (!(flags & _SOBIND_LOCK_HELD)) {
650 mutex_enter(&so->so_lock);
651 so_lock_single(so); /* Set SOLOCKED */
652 } else {
653 ASSERT(MUTEX_HELD(&so->so_lock));
654 ASSERT(so->so_flag & SOLOCKED);
655 }
656
657 /*
658 * Make sure that there is a preallocated unbind_req message
659 * before binding. This message is allocated when the socket is
660 * created but it might have been consumed.
661 */
662 if (sti->sti_unbind_mp == NULL) {
663 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
664 /* NOTE: holding so_lock while sleeping */
665 sti->sti_unbind_mp =
666 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
667 cr);
668 }
669
670 if (flags & _SOBIND_REBIND) {
671 /*
672 * Called from solisten after doing an sotpi_unbind() or
673 * potentially without the unbind (latter for AF_INET{,6}).
674 */
675 ASSERT(name == NULL && namelen == 0);
676
677 if (so->so_family == AF_UNIX) {
678 ASSERT(sti->sti_ux_bound_vp);
679 addr = &sti->sti_ux_laddr;
680 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
681 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
682 "addr 0x%p, vp %p\n",
683 addrlen,
684 (void *)((struct so_ux_addr *)addr)->soua_vp,
685 (void *)sti->sti_ux_bound_vp));
686 } else {
687 addr = sti->sti_laddr_sa;
688 addrlen = (t_uscalar_t)sti->sti_laddr_len;
689 }
690 } else if (flags & _SOBIND_UNSPEC) {
691 ASSERT(name == NULL && namelen == 0);
692
693 /*
694 * The caller checked SS_ISBOUND but not necessarily
695 * under so_lock
696 */
697 if (so->so_state & SS_ISBOUND) {
698 /* No error */
699 goto done;
700 }
701
702 /* Set an initial local address */
703 switch (so->so_family) {
704 case AF_UNIX:
705 /*
706 * Use an address with same size as struct sockaddr
707 * just like BSD.
708 */
709 sti->sti_laddr_len =
710 (socklen_t)sizeof (struct sockaddr);
711 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
712 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
713 sti->sti_laddr_sa->sa_family = so->so_family;
714
715 /*
716 * Pass down an address with the implicit bind
717 * magic number and the rest all zeros.
718 * The transport will return a unique address.
719 */
720 sti->sti_ux_laddr.soua_vp = NULL;
721 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
722 addr = &sti->sti_ux_laddr;
723 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
724 break;
725
726 case AF_INET:
727 case AF_INET6:
728 /*
729 * An unspecified bind in TPI has a NULL address.
730 * Set the address in sockfs to have the sa_family.
731 */
732 sti->sti_laddr_len = (so->so_family == AF_INET) ?
733 (socklen_t)sizeof (sin_t) :
734 (socklen_t)sizeof (sin6_t);
735 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
736 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
737 sti->sti_laddr_sa->sa_family = so->so_family;
738 addr = NULL;
739 addrlen = 0;
740 break;
741
742 default:
743 /*
744 * An unspecified bind in TPI has a NULL address.
745 * Set the address in sockfs to be zero length.
746 *
747 * Can not assume there is a sa_family for all
748 * protocol families. For example, AF_X25 does not
749 * have a family field.
750 */
751 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
752 sti->sti_laddr_len = 0; /* XXX correct? */
753 addr = NULL;
754 addrlen = 0;
755 break;
756 }
757
758 } else {
759 if (so->so_state & SS_ISBOUND) {
760 /*
761 * If it is ok to rebind the socket, first unbind
762 * with the transport. A rebind to the NULL address
763 * is interpreted as an unbind.
764 * Note that a bind to NULL in BSD does unbind the
765 * socket but it fails with EINVAL.
766 * Note that regular sockets set SOV_SOCKBSD i.e.
767 * _SOBIND_SOCKBSD gets set here hence no type of
768 * socket does currently allow rebinding.
769 *
770 * If the name is NULL just do an unbind.
771 */
772 if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
773 name != NULL) {
774 error = EINVAL;
775 unbind_on_err = 0;
776 eprintsoline(so, error);
777 goto done;
778 }
779 if ((so->so_mode & SM_CONNREQUIRED) &&
780 (so->so_state & SS_CANTREBIND)) {
781 error = EINVAL;
782 unbind_on_err = 0;
783 eprintsoline(so, error);
784 goto done;
785 }
786 error = sotpi_unbind(so, 0);
787 if (error) {
788 eprintsoline(so, error);
789 goto done;
790 }
791 ASSERT(!(so->so_state & SS_ISBOUND));
792 if (name == NULL) {
793 so->so_state &=
794 ~(SS_ISCONNECTED|SS_ISCONNECTING);
795 goto done;
796 }
797 }
798
799 /* X/Open requires this check */
800 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
801 if (xnet_check_print) {
802 printf("sockfs: X/Open bind state check "
803 "caused EINVAL\n");
804 }
805 error = EINVAL;
806 goto done;
807 }
808
809 switch (so->so_family) {
810 case AF_UNIX:
811 /*
812 * All AF_UNIX addresses are nul terminated
813 * when copied (copyin_name) in so the minimum
814 * length is 3 bytes.
815 */
816 if (name == NULL ||
817 (ssize_t)namelen <= sizeof (short) + 1) {
818 error = EISDIR;
819 eprintsoline(so, error);
820 goto done;
821 }
822 /*
823 * Verify so_family matches the bound family.
824 * BSD does not check this for AF_UNIX resulting
825 * in funny mknods.
826 */
827 if (name->sa_family != so->so_family) {
828 error = EAFNOSUPPORT;
829 goto done;
830 }
831 break;
832 case AF_INET:
833 if (name == NULL) {
834 error = EINVAL;
835 eprintsoline(so, error);
836 goto done;
837 }
838 if ((size_t)namelen != sizeof (sin_t)) {
839 error = name->sa_family != so->so_family ?
840 EAFNOSUPPORT : EINVAL;
841 eprintsoline(so, error);
842 goto done;
843 }
844 if ((flags & _SOBIND_XPG4_2) &&
845 (name->sa_family != so->so_family)) {
846 /*
847 * This check has to be made for X/Open
848 * sockets however application failures have
849 * been observed when it is applied to
850 * all sockets.
851 */
852 error = EAFNOSUPPORT;
853 eprintsoline(so, error);
854 goto done;
855 }
856 /*
857 * Force a zero sa_family to match so_family.
858 *
859 * Some programs like inetd(1M) don't set the
860 * family field. Other programs leave
861 * sin_family set to garbage - SunOS 4.X does
862 * not check the family field on a bind.
863 * We use the family field that
864 * was passed in to the socket() call.
865 */
866 name->sa_family = so->so_family;
867 break;
868
869 case AF_INET6: {
870 #ifdef DEBUG
871 sin6_t *sin6 = (sin6_t *)name;
872 #endif /* DEBUG */
873
874 if (name == NULL) {
875 error = EINVAL;
876 eprintsoline(so, error);
877 goto done;
878 }
879 if ((size_t)namelen != sizeof (sin6_t)) {
880 error = name->sa_family != so->so_family ?
881 EAFNOSUPPORT : EINVAL;
882 eprintsoline(so, error);
883 goto done;
884 }
885 if (name->sa_family != so->so_family) {
886 /*
887 * With IPv6 we require the family to match
888 * unlike in IPv4.
889 */
890 error = EAFNOSUPPORT;
891 eprintsoline(so, error);
892 goto done;
893 }
894 #ifdef DEBUG
895 /*
896 * Verify that apps don't forget to clear
897 * sin6_scope_id etc
898 */
899 if (sin6->sin6_scope_id != 0 &&
900 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
901 zcmn_err(getzoneid(), CE_WARN,
902 "bind with uninitialized sin6_scope_id "
903 "(%d) on socket. Pid = %d\n",
904 (int)sin6->sin6_scope_id,
905 (int)curproc->p_pid);
906 }
907 if (sin6->__sin6_src_id != 0) {
908 zcmn_err(getzoneid(), CE_WARN,
909 "bind with uninitialized __sin6_src_id "
910 "(%d) on socket. Pid = %d\n",
911 (int)sin6->__sin6_src_id,
912 (int)curproc->p_pid);
913 }
914 #endif /* DEBUG */
915 break;
916 }
917 default:
918 /*
919 * Don't do any length or sa_family check to allow
920 * non-sockaddr style addresses.
921 */
922 if (name == NULL) {
923 error = EINVAL;
924 eprintsoline(so, error);
925 goto done;
926 }
927 break;
928 }
929
930 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
931 error = ENAMETOOLONG;
932 eprintsoline(so, error);
933 goto done;
934 }
935 /*
936 * Save local address.
937 */
938 sti->sti_laddr_len = (socklen_t)namelen;
939 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
940 bcopy(name, sti->sti_laddr_sa, namelen);
941
942 addr = sti->sti_laddr_sa;
943 addrlen = (t_uscalar_t)sti->sti_laddr_len;
944 switch (so->so_family) {
945 case AF_INET6:
946 case AF_INET:
947 break;
948 case AF_UNIX: {
949 struct sockaddr_un *soun =
950 (struct sockaddr_un *)sti->sti_laddr_sa;
951 struct vnode *vp, *rvp;
952 struct vattr vattr;
953
954 ASSERT(sti->sti_ux_bound_vp == NULL);
955 /*
956 * Create vnode for the specified path name.
957 * Keep vnode held with a reference in sti_ux_bound_vp.
958 * Use the vnode pointer as the address used in the
959 * bind with the transport.
960 *
961 * Use the same mode as in BSD: 0777 with the bits of the
962 * process file creation mask (u_cmask) cleared.
963 */
964 /* MAXPATHLEN + soun_family + nul termination */
965 if (sti->sti_laddr_len >
966 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
967 error = ENAMETOOLONG;
968 eprintsoline(so, error);
969 goto done;
970 }
971 vattr.va_type = VSOCK;
972 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
973 vattr.va_mask = AT_TYPE|AT_MODE;
974 /* NOTE: holding so_lock */
975 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
976 EXCL, 0, &vp, CRMKNOD, 0, 0);
977 if (error) {
978 if (error == EEXIST)
979 error = EADDRINUSE;
980 eprintsoline(so, error);
981 goto done;
982 }
983 /*
984 * Establish pointer from the underlying filesystem
985 * vnode to the socket node.
986 * sti_ux_bound_vp and v_stream->sd_vnode form the
987 * cross-linkage between the underlying filesystem
988 * node and the socket node.
989 */
990
991 if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
992 VN_HOLD(rvp);
993 VN_RELE(vp);
994 vp = rvp;
995 }
996
997 ASSERT(SOTOV(so)->v_stream);
998 mutex_enter(&vp->v_lock);
999 vp->v_stream = SOTOV(so)->v_stream;
1000 sti->sti_ux_bound_vp = vp;
1001 mutex_exit(&vp->v_lock);
1002
1003 /*
1004 * Use the vnode pointer value as a unique address
1005 * (together with the magic number to avoid conflicts
1006 * with implicit binds) in the transport provider.
1007 */
1008 sti->sti_ux_laddr.soua_vp =
1009 (void *)sti->sti_ux_bound_vp;
1010 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1011 addr = &sti->sti_ux_laddr;
1012 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1013 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1014 addrlen,
1015 (void *)((struct so_ux_addr *)addr)->soua_vp));
1016 break;
1017 }
1018 } /* end switch (so->so_family) */
1019 }
1020
1021 /*
1022 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1023 * the transport can start passing up T_CONN_IND messages
1024 * as soon as it receives the bind req and strsock_proto()
1025 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1026 */
1027 if (flags & _SOBIND_LISTEN) {
1028 if ((so->so_state & SS_ACCEPTCONN) == 0)
1029 clear_acceptconn_on_err = B_TRUE;
1030 save_so_backlog = so->so_backlog;
1031 restore_backlog_on_err = B_TRUE;
1032 so->so_state |= SS_ACCEPTCONN;
1033 so->so_backlog = backlog;
1034 }
1035
1036 /*
1037 * If NL7C addr(s) have been configured check for addr/port match,
1038 * or if an implicit NL7C socket via AF_NCA mark socket as NL7C.
1039 *
1040 * NL7C supports the TCP transport only so check AF_INET and AF_INET6
1041 * family sockets only. If match mark as such.
1042 */
1043 if (nl7c_enabled && ((addr != NULL &&
1044 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
1045 (nl7c = nl7c_lookup_addr(addr, addrlen))) ||
1046 sti->sti_nl7c_flags == NL7C_AF_NCA)) {
1047 /*
1048 * NL7C is not supported in non-global zones,
1049 * we enforce this restriction here.
1050 */
1051 if (so->so_zoneid == GLOBAL_ZONEID) {
1052 /* An NL7C socket, mark it */
1053 sti->sti_nl7c_flags |= NL7C_ENABLED;
1054 if (nl7c == NULL) {
1055 /*
1056 * Was an AF_NCA bind() so add it to the
1057 * addr list for reporting purposes.
1058 */
1059 nl7c = nl7c_add_addr(addr, addrlen);
1060 }
1061 } else
1062 nl7c = NULL;
1063 }
1064
1065 /*
1066 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1067 * for other transports we will send in a O_T_BIND_REQ.
1068 */
1069 if (tcp_udp_xport &&
1070 (so->so_family == AF_INET || so->so_family == AF_INET6))
1071 PRIM_type = T_BIND_REQ;
1072
1073 bind_req.PRIM_type = PRIM_type;
1074 bind_req.ADDR_length = addrlen;
1075 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1076 bind_req.CONIND_number = backlog;
1077 /* NOTE: holding so_lock while sleeping */
1078 mp = soallocproto2(&bind_req, sizeof (bind_req),
1079 addr, addrlen, 0, _ALLOC_SLEEP, cr);
1080 sti->sti_laddr_valid = 0;
1081
1082 /* Done using sti_laddr_sa - can drop the lock */
1083 mutex_exit(&so->so_lock);
1084
1085 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1086 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1087 if (error) {
1088 eprintsoline(so, error);
1089 mutex_enter(&so->so_lock);
1090 goto done;
1091 }
1092
1093 mutex_enter(&so->so_lock);
1094 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1095 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1096 if (error) {
1097 eprintsoline(so, error);
1098 goto done;
1099 }
1100 ASSERT(mp);
1101 /*
1102 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1103 * strsock_proto while the lock was dropped above, the bind
1104 * is allowed to complete.
1105 */
1106
1107 /* Mark as bound. This will be undone if we detect errors below. */
1108 if (flags & _SOBIND_NOXLATE) {
1109 ASSERT(so->so_family == AF_UNIX);
1110 sti->sti_faddr_noxlate = 1;
1111 }
1112 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1113 so->so_state |= SS_ISBOUND;
1114 ASSERT(sti->sti_unbind_mp);
1115
1116 /* note that we've already set SS_ACCEPTCONN above */
1117
1118 /*
1119 * Recompute addrlen - an unspecified bind sent down an
1120 * address of length zero but we expect the appropriate length
1121 * in return.
1122 */
1123 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1124 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1125
1126 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1127 /*
1128 * The alignment restriction is really too strict but
1129 * we want enough alignment to inspect the fields of
1130 * a sockaddr_in.
1131 */
1132 addr = sogetoff(mp, bind_ack->ADDR_offset,
1133 bind_ack->ADDR_length,
1134 __TPI_ALIGN_SIZE);
1135 if (addr == NULL) {
1136 freemsg(mp);
1137 error = EPROTO;
1138 eprintsoline(so, error);
1139 goto done;
1140 }
1141 if (!(flags & _SOBIND_UNSPEC)) {
1142 /*
1143 * Verify that the transport didn't return something we
1144 * did not want e.g. an address other than what we asked for.
1145 *
1146 * NOTE: These checks would go away if/when we switch to
1147 * using the new TPI (in which the transport would fail
1148 * the request instead of assigning a different address).
1149 *
1150 * NOTE2: For protocols that we don't know (i.e. any
1151 * other than AF_INET6, AF_INET and AF_UNIX), we
1152 * cannot know if the transport should be expected to
1153 * return the same address as that requested.
1154 *
1155 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1156 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1157 *
1158 * For example, in the case of netatalk it may be
1159 * inappropriate for the transport to return the
1160 * requested address (as it may have allocated a local
1161 * port number in behaviour similar to that of an
1162 * AF_INET bind request with a port number of zero).
1163 *
1164 * Given the definition of O_T_BIND_REQ, where the
1165 * transport may bind to an address other than the
1166 * requested address, it's not possible to determine
1167 * whether a returned address that differs from the
1168 * requested address is a reason to fail (because the
1169 * requested address was not available) or succeed
1170 * (because the transport allocated an appropriate
1171 * address and/or port).
1172 *
1173 * sockfs currently requires that the transport return
1174 * the requested address in the T_BIND_ACK, unless
1175 * there is code here to allow for any discrepancy.
1176 * Such code exists for AF_INET and AF_INET6.
1177 *
1178 * Netatalk chooses to return the requested address
1179 * rather than the (correct) allocated address. This
1180 * means that netatalk violates the TPI specification
1181 * (and would not function correctly if used from a
1182 * TLI application), but it does mean that it works
1183 * with sockfs.
1184 *
1185 * As noted above, using the newer XTI bind primitive
1186 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1187 * allow sockfs to be more sure about whether or not
1188 * the bind request had succeeded (as transports are
1189 * not permitted to bind to a different address than
1190 * that requested - they must return failure).
1191 * Unfortunately, support for T_BIND_REQ may not be
1192 * present in all transport implementations (netatalk,
1193 * for example, doesn't have it), making the
1194 * transition difficult.
1195 */
1196 if (bind_ack->ADDR_length != addrlen) {
1197 /* Assumes that the requested address was in use */
1198 freemsg(mp);
1199 error = EADDRINUSE;
1200 eprintsoline(so, error);
1201 goto done;
1202 }
1203
1204 switch (so->so_family) {
1205 case AF_INET6:
1206 case AF_INET: {
1207 sin_t *rname, *aname;
1208
1209 rname = (sin_t *)addr;
1210 aname = (sin_t *)sti->sti_laddr_sa;
1211
1212 /*
1213 * Take advantage of the alignment
1214 * of sin_port and sin6_port which fall
1215 * in the same place in their data structures.
1216 * Just use sin_port for either address family.
1217 *
1218 * This may become a problem if (heaven forbid)
1219 * there's a separate ipv6port_reserved... :-P
1220 *
1221 * Binding to port 0 has the semantics of letting
1222 * the transport bind to any port.
1223 *
1224 * If the transport is TCP or UDP since we had sent
1225 * a T_BIND_REQ we would not get a port other than
1226 * what we asked for.
1227 */
1228 if (tcp_udp_xport) {
1229 /*
1230 * Pick up the new port number if we bound to
1231 * port 0.
1232 */
1233 if (aname->sin_port == 0)
1234 aname->sin_port = rname->sin_port;
1235 sti->sti_laddr_valid = 1;
1236 break;
1237 }
1238 if (aname->sin_port != 0 &&
1239 aname->sin_port != rname->sin_port) {
1240 freemsg(mp);
1241 error = EADDRINUSE;
1242 eprintsoline(so, error);
1243 goto done;
1244 }
1245 /*
1246 * Pick up the new port number if we bound to port 0.
1247 */
1248 aname->sin_port = rname->sin_port;
1249
1250 /*
1251 * Unfortunately, addresses aren't _quite_ the same.
1252 */
1253 if (so->so_family == AF_INET) {
1254 if (aname->sin_addr.s_addr !=
1255 rname->sin_addr.s_addr) {
1256 freemsg(mp);
1257 error = EADDRNOTAVAIL;
1258 eprintsoline(so, error);
1259 goto done;
1260 }
1261 } else {
1262 sin6_t *rname6 = (sin6_t *)rname;
1263 sin6_t *aname6 = (sin6_t *)aname;
1264
1265 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1266 &rname6->sin6_addr)) {
1267 freemsg(mp);
1268 error = EADDRNOTAVAIL;
1269 eprintsoline(so, error);
1270 goto done;
1271 }
1272 }
1273 break;
1274 }
1275 case AF_UNIX:
1276 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1277 freemsg(mp);
1278 error = EADDRINUSE;
1279 eprintsoline(so, error);
1280 eprintso(so,
1281 ("addrlen %d, addr 0x%x, vp %p\n",
1282 addrlen, *((int *)addr),
1283 (void *)sti->sti_ux_bound_vp));
1284 goto done;
1285 }
1286 sti->sti_laddr_valid = 1;
1287 break;
1288 default:
1289 /*
1290 * NOTE: This assumes that addresses can be
1291 * byte-compared for equivalence.
1292 */
1293 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1294 freemsg(mp);
1295 error = EADDRINUSE;
1296 eprintsoline(so, error);
1297 goto done;
1298 }
1299 /*
1300 * Don't mark sti_laddr_valid, as we cannot be
1301 * sure that the returned address is the real
1302 * bound address when talking to an unknown
1303 * transport.
1304 */
1305 break;
1306 }
1307 } else {
1308 /*
1309 * Save the returned address for getsockname.
1310 * Needed for unspecific bind unless transport supports
1311 * the TI_GETMYNAME ioctl.
1312 * Do this for AF_INET{,6} even though they do, as
1313 * caching info here is much better performance than
1314 * a TPI/STREAMS trip to the transport for getsockname.
1315 * Any which can't for some reason _must_ _not_ set
1316 * sti_laddr_valid here for the caching version of
1317 * getsockname to not break;
1318 */
1319 switch (so->so_family) {
1320 case AF_UNIX:
1321 /*
1322 * Record the address bound with the transport
1323 * for use by socketpair.
1324 */
1325 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1326 sti->sti_laddr_valid = 1;
1327 break;
1328 case AF_INET:
1329 case AF_INET6:
1330 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1331 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1332 sti->sti_laddr_valid = 1;
1333 break;
1334 default:
1335 /*
1336 * Don't mark sti_laddr_valid, as we cannot be
1337 * sure that the returned address is the real
1338 * bound address when talking to an unknown
1339 * transport.
1340 */
1341 break;
1342 }
1343 }
1344
1345 if (nl7c != NULL) {
1346 /* Register listen()er sonode pointer with NL7C */
1347 nl7c_listener_addr(nl7c, so);
1348 }
1349
1350 freemsg(mp);
1351
1352 done:
1353 if (error) {
1354 /* reset state & backlog to values held on entry */
1355 if (clear_acceptconn_on_err == B_TRUE)
1356 so->so_state &= ~SS_ACCEPTCONN;
1357 if (restore_backlog_on_err == B_TRUE)
1358 so->so_backlog = save_so_backlog;
1359
1360 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1361 int err;
1362
1363 err = sotpi_unbind(so, 0);
1364 /* LINTED - statement has no consequent: if */
1365 if (err) {
1366 eprintsoline(so, error);
1367 } else {
1368 ASSERT(!(so->so_state & SS_ISBOUND));
1369 }
1370 }
1371 }
1372 if (!(flags & _SOBIND_LOCK_HELD)) {
1373 so_unlock_single(so, SOLOCKED);
1374 mutex_exit(&so->so_lock);
1375 } else {
1376 ASSERT(MUTEX_HELD(&so->so_lock));
1377 ASSERT(so->so_flag & SOLOCKED);
1378 }
1379 return (error);
1380 }
1381
1382 /* bind the socket */
1383 static int
sotpi_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,struct cred * cr)1384 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1385 int flags, struct cred *cr)
1386 {
1387 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1388 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1389
1390 flags &= ~_SOBIND_SOCKETPAIR;
1391 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1392 }
1393
1394 /*
1395 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1396 * address, or when listen needs to unbind and bind.
1397 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1398 * so that a sobind can pick them up.
1399 */
1400 static int
sotpi_unbind(struct sonode * so,int flags)1401 sotpi_unbind(struct sonode *so, int flags)
1402 {
1403 struct T_unbind_req unbind_req;
1404 int error = 0;
1405 mblk_t *mp;
1406 sotpi_info_t *sti = SOTOTPI(so);
1407
1408 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1409 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1410
1411 ASSERT(MUTEX_HELD(&so->so_lock));
1412 ASSERT(so->so_flag & SOLOCKED);
1413
1414 if (!(so->so_state & SS_ISBOUND)) {
1415 error = EINVAL;
1416 eprintsoline(so, error);
1417 goto done;
1418 }
1419
1420 mutex_exit(&so->so_lock);
1421
1422 /*
1423 * Flush the read and write side (except stream head read queue)
1424 * and send down T_UNBIND_REQ.
1425 */
1426 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1427
1428 unbind_req.PRIM_type = T_UNBIND_REQ;
1429 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1430 0, _ALLOC_SLEEP, CRED());
1431 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1432 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1433 mutex_enter(&so->so_lock);
1434 if (error) {
1435 eprintsoline(so, error);
1436 goto done;
1437 }
1438
1439 error = sowaitokack(so, T_UNBIND_REQ);
1440 if (error) {
1441 eprintsoline(so, error);
1442 goto done;
1443 }
1444
1445 /*
1446 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1447 * strsock_proto while the lock was dropped above, the unbind
1448 * is allowed to complete.
1449 */
1450 if (!(flags & _SOUNBIND_REBIND)) {
1451 /*
1452 * Clear out bound address.
1453 */
1454 vnode_t *vp;
1455
1456 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1457 sti->sti_ux_bound_vp = NULL;
1458 vn_rele_stream(vp);
1459 }
1460 /* Clear out address */
1461 sti->sti_laddr_len = 0;
1462 }
1463 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1464 sti->sti_laddr_valid = 0;
1465
1466 done:
1467
1468 /* If the caller held the lock don't release it here */
1469 ASSERT(MUTEX_HELD(&so->so_lock));
1470 ASSERT(so->so_flag & SOLOCKED);
1471
1472 return (error);
1473 }
1474
1475 /*
1476 * listen on the socket.
1477 * For TPI conforming transports this has to first unbind with the transport
1478 * and then bind again using the new backlog.
1479 */
1480 /* ARGSUSED */
1481 int
sotpi_listen(struct sonode * so,int backlog,struct cred * cr)1482 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1483 {
1484 int error = 0;
1485 sotpi_info_t *sti = SOTOTPI(so);
1486
1487 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1488 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1489
1490 if (sti->sti_serv_type == T_CLTS)
1491 return (EOPNOTSUPP);
1492
1493 /*
1494 * If the socket is ready to accept connections already, then
1495 * return without doing anything. This avoids a problem where
1496 * a second listen() call fails if a connection is pending and
1497 * leaves the socket unbound. Only when we are not unbinding
1498 * with the transport can we safely increase the backlog.
1499 */
1500 if (so->so_state & SS_ACCEPTCONN &&
1501 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1502 /*CONSTCOND*/
1503 !solisten_tpi_tcp))
1504 return (0);
1505
1506 if (so->so_state & SS_ISCONNECTED)
1507 return (EINVAL);
1508
1509 mutex_enter(&so->so_lock);
1510 so_lock_single(so); /* Set SOLOCKED */
1511
1512 /*
1513 * If the listen doesn't change the backlog we do nothing.
1514 * This avoids an EPROTO error from the transport.
1515 */
1516 if ((so->so_state & SS_ACCEPTCONN) &&
1517 so->so_backlog == backlog)
1518 goto done;
1519
1520 if (!(so->so_state & SS_ISBOUND)) {
1521 /*
1522 * Must have been explicitly bound in the UNIX domain.
1523 */
1524 if (so->so_family == AF_UNIX) {
1525 error = EINVAL;
1526 goto done;
1527 }
1528 error = sotpi_bindlisten(so, NULL, 0, backlog,
1529 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1530 } else if (backlog > 0) {
1531 /*
1532 * AF_INET{,6} hack to avoid losing the port.
1533 * Assumes that all AF_INET{,6} transports can handle a
1534 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1535 * has already bound thus it is possible to avoid the unbind.
1536 */
1537 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1538 /*CONSTCOND*/
1539 !solisten_tpi_tcp)) {
1540 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1541 if (error)
1542 goto done;
1543 }
1544 error = sotpi_bindlisten(so, NULL, 0, backlog,
1545 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1546 } else {
1547 so->so_state |= SS_ACCEPTCONN;
1548 so->so_backlog = backlog;
1549 }
1550 if (error)
1551 goto done;
1552 ASSERT(so->so_state & SS_ACCEPTCONN);
1553 done:
1554 so_unlock_single(so, SOLOCKED);
1555 mutex_exit(&so->so_lock);
1556 return (error);
1557 }
1558
1559 /*
1560 * Disconnect either a specified seqno or all (-1).
1561 * The former is used on listening sockets only.
1562 *
1563 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1564 * the current use of sodisconnect(seqno == -1) is only for shutdown
1565 * so there is no point (and potentially incorrect) to unbind.
1566 */
1567 static int
sodisconnect(struct sonode * so,t_scalar_t seqno,int flags)1568 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1569 {
1570 struct T_discon_req discon_req;
1571 int error = 0;
1572 mblk_t *mp;
1573
1574 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1575 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1576
1577 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1578 mutex_enter(&so->so_lock);
1579 so_lock_single(so); /* Set SOLOCKED */
1580 } else {
1581 ASSERT(MUTEX_HELD(&so->so_lock));
1582 ASSERT(so->so_flag & SOLOCKED);
1583 }
1584
1585 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1586 error = EINVAL;
1587 eprintsoline(so, error);
1588 goto done;
1589 }
1590
1591 mutex_exit(&so->so_lock);
1592 /*
1593 * Flush the write side (unless this is a listener)
1594 * and then send down a T_DISCON_REQ.
1595 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1596 * and other messages.)
1597 */
1598 if (!(so->so_state & SS_ACCEPTCONN))
1599 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1600
1601 discon_req.PRIM_type = T_DISCON_REQ;
1602 discon_req.SEQ_number = seqno;
1603 mp = soallocproto1(&discon_req, sizeof (discon_req),
1604 0, _ALLOC_SLEEP, CRED());
1605 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1606 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1607 mutex_enter(&so->so_lock);
1608 if (error) {
1609 eprintsoline(so, error);
1610 goto done;
1611 }
1612
1613 error = sowaitokack(so, T_DISCON_REQ);
1614 if (error) {
1615 eprintsoline(so, error);
1616 goto done;
1617 }
1618 /*
1619 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1620 * strsock_proto while the lock was dropped above, the disconnect
1621 * is allowed to complete. However, it is not possible to
1622 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1623 */
1624 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1625 SOTOTPI(so)->sti_laddr_valid = 0;
1626 SOTOTPI(so)->sti_faddr_valid = 0;
1627 done:
1628 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1629 so_unlock_single(so, SOLOCKED);
1630 mutex_exit(&so->so_lock);
1631 } else {
1632 /* If the caller held the lock don't release it here */
1633 ASSERT(MUTEX_HELD(&so->so_lock));
1634 ASSERT(so->so_flag & SOLOCKED);
1635 }
1636 return (error);
1637 }
1638
1639 /* ARGSUSED */
1640 int
sotpi_accept(struct sonode * so,int fflag,struct cred * cr,struct sonode ** nsop)1641 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1642 struct sonode **nsop)
1643 {
1644 struct T_conn_ind *conn_ind;
1645 struct T_conn_res *conn_res;
1646 int error = 0;
1647 mblk_t *mp, *ack_mp;
1648 struct sonode *nso;
1649 vnode_t *nvp;
1650 void *src;
1651 t_uscalar_t srclen;
1652 void *opt;
1653 t_uscalar_t optlen;
1654 t_scalar_t PRIM_type;
1655 t_scalar_t SEQ_number;
1656 size_t sinlen;
1657 sotpi_info_t *sti = SOTOTPI(so);
1658 sotpi_info_t *nsti;
1659
1660 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1661 (void *)so, fflag, (void *)nsop,
1662 pr_state(so->so_state, so->so_mode)));
1663
1664 /*
1665 * Defer single-threading the accepting socket until
1666 * the T_CONN_IND has been received and parsed and the
1667 * new sonode has been opened.
1668 */
1669
1670 /* Check that we are not already connected */
1671 if ((so->so_state & SS_ACCEPTCONN) == 0)
1672 goto conn_bad;
1673 again:
1674 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1675 goto e_bad;
1676
1677 ASSERT(mp != NULL);
1678 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1679
1680 /*
1681 * Save SEQ_number for error paths.
1682 */
1683 SEQ_number = conn_ind->SEQ_number;
1684
1685 srclen = conn_ind->SRC_length;
1686 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1687 if (src == NULL) {
1688 error = EPROTO;
1689 freemsg(mp);
1690 eprintsoline(so, error);
1691 goto disconnect_unlocked;
1692 }
1693 optlen = conn_ind->OPT_length;
1694 switch (so->so_family) {
1695 case AF_INET:
1696 case AF_INET6:
1697 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1698 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1699 &opt, conn_ind->OPT_length);
1700 } else {
1701 /*
1702 * The transport (in this case TCP) hasn't sent up
1703 * a pointer to an instance for the accept fast-path.
1704 * Disable fast-path completely because the call to
1705 * sotpi_create() below would otherwise create an
1706 * incomplete TCP instance, which would lead to
1707 * problems when sockfs sends a normal T_CONN_RES
1708 * message down the new stream.
1709 */
1710 if (sti->sti_direct) {
1711 int rval;
1712 /*
1713 * For consistency we inform tcp to disable
1714 * direct interface on the listener, though
1715 * we can certainly live without doing this
1716 * because no data will ever travel upstream
1717 * on the listening socket.
1718 */
1719 sti->sti_direct = 0;
1720 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1721 0, 0, K_TO_K, cr, &rval);
1722 }
1723 opt = NULL;
1724 optlen = 0;
1725 }
1726 break;
1727 case AF_UNIX:
1728 default:
1729 if (optlen != 0) {
1730 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1731 __TPI_ALIGN_SIZE);
1732 if (opt == NULL) {
1733 error = EPROTO;
1734 freemsg(mp);
1735 eprintsoline(so, error);
1736 goto disconnect_unlocked;
1737 }
1738 }
1739 if (so->so_family == AF_UNIX) {
1740 if (!sti->sti_faddr_noxlate) {
1741 src = NULL;
1742 srclen = 0;
1743 }
1744 /* Extract src address from options */
1745 if (optlen != 0)
1746 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1747 }
1748 break;
1749 }
1750
1751 /*
1752 * Create the new socket.
1753 */
1754 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1755 if (nso == NULL) {
1756 ASSERT(error != 0);
1757 /*
1758 * Accept can not fail with ENOBUFS. sotpi_create
1759 * sleeps waiting for memory until a signal is caught
1760 * so return EINTR.
1761 */
1762 freemsg(mp);
1763 if (error == ENOBUFS)
1764 error = EINTR;
1765 goto e_disc_unl;
1766 }
1767 nvp = SOTOV(nso);
1768 nsti = SOTOTPI(nso);
1769
1770 #ifdef DEBUG
1771 /*
1772 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1773 * it's inherited early to allow debugging of the accept code itself.
1774 */
1775 nso->so_options |= so->so_options & SO_DEBUG;
1776 #endif /* DEBUG */
1777
1778 /*
1779 * Save the SRC address from the T_CONN_IND
1780 * for getpeername to work on AF_UNIX and on transports that do not
1781 * support TI_GETPEERNAME.
1782 *
1783 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1784 * copyin_name().
1785 */
1786 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1787 error = EINVAL;
1788 freemsg(mp);
1789 eprintsoline(so, error);
1790 goto disconnect_vp_unlocked;
1791 }
1792 nsti->sti_faddr_len = (socklen_t)srclen;
1793 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1794 bcopy(src, nsti->sti_faddr_sa, srclen);
1795 nsti->sti_faddr_valid = 1;
1796
1797 /*
1798 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1799 */
1800 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1801 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1802 cred_t *cr;
1803 pid_t cpid;
1804
1805 cr = msg_getcred(mp, &cpid);
1806 if (cr != NULL) {
1807 crhold(cr);
1808 nso->so_peercred = cr;
1809 nso->so_cpid = cpid;
1810 }
1811 freemsg(mp);
1812
1813 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1814 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1815 if (mp == NULL) {
1816 /*
1817 * Accept can not fail with ENOBUFS.
1818 * A signal was caught so return EINTR.
1819 */
1820 error = EINTR;
1821 eprintsoline(so, error);
1822 goto disconnect_vp_unlocked;
1823 }
1824 conn_res = (struct T_conn_res *)mp->b_rptr;
1825 } else {
1826 /*
1827 * For efficiency reasons we use msg_extractcred; no crhold
1828 * needed since db_credp is cleared (i.e., we move the cred
1829 * from the message to so_peercred).
1830 */
1831 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1832
1833 mp->b_rptr = DB_BASE(mp);
1834 conn_res = (struct T_conn_res *)mp->b_rptr;
1835 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1836
1837 mblk_setcred(mp, cr, curproc->p_pid);
1838 }
1839
1840 /*
1841 * New socket must be bound at least in sockfs and, except for AF_INET,
1842 * (or AF_INET6) it also has to be bound in the transport provider.
1843 * We set the local address in the sonode from the T_OK_ACK of the
1844 * T_CONN_RES. For this reason the address we bind to here isn't
1845 * important.
1846 */
1847 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1848 /*CONSTCOND*/
1849 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1850 /*
1851 * Optimization for AF_INET{,6} transports
1852 * that can handle a T_CONN_RES without being bound.
1853 */
1854 mutex_enter(&nso->so_lock);
1855 so_automatic_bind(nso);
1856 mutex_exit(&nso->so_lock);
1857 } else {
1858 /* Perform NULL bind with the transport provider. */
1859 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1860 cr)) != 0) {
1861 ASSERT(error != ENOBUFS);
1862 freemsg(mp);
1863 eprintsoline(nso, error);
1864 goto disconnect_vp_unlocked;
1865 }
1866 }
1867
1868 /*
1869 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1870 * so that any data arriving on the new socket will cause the
1871 * appropriate signals to be delivered for the new socket.
1872 *
1873 * No other thread (except strsock_proto and strsock_misc)
1874 * can access the new socket thus we relax the locking.
1875 */
1876 nso->so_pgrp = so->so_pgrp;
1877 nso->so_state |= so->so_state & SS_ASYNC;
1878 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1879
1880 if (nso->so_pgrp != 0) {
1881 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1882 eprintsoline(nso, error);
1883 error = 0;
1884 nso->so_pgrp = 0;
1885 }
1886 }
1887
1888 /*
1889 * Make note of the socket level options. TCP and IP level options
1890 * are already inherited. We could do all this after accept is
1891 * successful but doing it here simplifies code and no harm done
1892 * for error case.
1893 */
1894 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1895 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1896 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1897 nso->so_sndbuf = so->so_sndbuf;
1898 nso->so_rcvbuf = so->so_rcvbuf;
1899 if (nso->so_options & SO_LINGER)
1900 nso->so_linger = so->so_linger;
1901
1902 /*
1903 * Note that the following sti_direct code path should be
1904 * removed once we are confident that the direct sockets
1905 * do not result in any degradation.
1906 */
1907 if (sti->sti_direct) {
1908
1909 ASSERT(opt != NULL);
1910
1911 conn_res->OPT_length = optlen;
1912 conn_res->OPT_offset = MBLKL(mp);
1913 bcopy(&opt, mp->b_wptr, optlen);
1914 mp->b_wptr += optlen;
1915 conn_res->PRIM_type = T_CONN_RES;
1916 conn_res->ACCEPTOR_id = 0;
1917 PRIM_type = T_CONN_RES;
1918
1919 /* Send down the T_CONN_RES on acceptor STREAM */
1920 error = kstrputmsg(SOTOV(nso), mp, NULL,
1921 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1922 if (error) {
1923 mutex_enter(&so->so_lock);
1924 so_lock_single(so);
1925 eprintsoline(so, error);
1926 goto disconnect_vp;
1927 }
1928 mutex_enter(&nso->so_lock);
1929 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1930 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1931 if (error) {
1932 mutex_exit(&nso->so_lock);
1933 mutex_enter(&so->so_lock);
1934 so_lock_single(so);
1935 eprintsoline(so, error);
1936 goto disconnect_vp;
1937 }
1938 if (nso->so_family == AF_INET) {
1939 sin_t *sin;
1940
1941 sin = (sin_t *)(ack_mp->b_rptr +
1942 sizeof (struct T_ok_ack));
1943 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1944 nsti->sti_laddr_len = sizeof (sin_t);
1945 } else {
1946 sin6_t *sin6;
1947
1948 sin6 = (sin6_t *)(ack_mp->b_rptr +
1949 sizeof (struct T_ok_ack));
1950 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1951 nsti->sti_laddr_len = sizeof (sin6_t);
1952 }
1953 freemsg(ack_mp);
1954
1955 nso->so_state |= SS_ISCONNECTED;
1956 nso->so_proto_handle = (sock_lower_handle_t)opt;
1957 nsti->sti_laddr_valid = 1;
1958
1959 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
1960 /*
1961 * A NL7C marked listen()er so the new socket
1962 * inherits the listen()er's NL7C state, except
1963 * for NL7C_POLLIN.
1964 *
1965 * Only call NL7C to process the new socket if
1966 * the listen socket allows blocking i/o.
1967 */
1968 nsti->sti_nl7c_flags =
1969 sti->sti_nl7c_flags & (~NL7C_POLLIN);
1970 if (so->so_state & (SS_NONBLOCK|SS_NDELAY)) {
1971 /*
1972 * Nonblocking accept() just make it
1973 * persist to defer processing to the
1974 * read-side syscall (e.g. read).
1975 */
1976 nsti->sti_nl7c_flags |= NL7C_SOPERSIST;
1977 } else if (nl7c_process(nso, B_FALSE)) {
1978 /*
1979 * NL7C has completed processing on the
1980 * socket, close the socket and back to
1981 * the top to await the next T_CONN_IND.
1982 */
1983 mutex_exit(&nso->so_lock);
1984 (void) VOP_CLOSE(nvp, 0, 1, (offset_t)0,
1985 cr, NULL);
1986 VN_RELE(nvp);
1987 goto again;
1988 }
1989 /* Pass the new socket out */
1990 }
1991
1992 mutex_exit(&nso->so_lock);
1993
1994 /*
1995 * It's possible, through the use of autopush for example,
1996 * that the acceptor stream may not support sti_direct
1997 * semantics. If the new socket does not support sti_direct
1998 * we issue a _SIOCSOCKFALLBACK to inform the transport
1999 * as we would in the I_PUSH case.
2000 */
2001 if (nsti->sti_direct == 0) {
2002 int rval;
2003
2004 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
2005 0, 0, K_TO_K, cr, &rval)) != 0) {
2006 mutex_enter(&so->so_lock);
2007 so_lock_single(so);
2008 eprintsoline(so, error);
2009 goto disconnect_vp;
2010 }
2011 }
2012
2013 /*
2014 * Pass out new socket.
2015 */
2016 if (nsop != NULL)
2017 *nsop = nso;
2018
2019 return (0);
2020 }
2021
2022 /*
2023 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
2024 * which don't support the FireEngine accept fast-path. It is also
2025 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
2026 * again. Neither sockfs nor TCP attempt to find out if some other
2027 * random module has been inserted in between (in which case we
2028 * should follow TLI accept behaviour). We blindly assume the worst
2029 * case and revert back to old behaviour i.e. TCP will not send us
2030 * any option (eager) and the accept should happen on the listener
2031 * queue. Any queued T_conn_ind have already got their options removed
2032 * by so_sock2_stream() when "sockmod" was I_POP'd.
2033 */
2034 /*
2035 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
2036 */
2037 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
2038 #ifdef _ILP32
2039 queue_t *q;
2040
2041 /*
2042 * Find read queue in driver
2043 * Can safely do this since we "own" nso/nvp.
2044 */
2045 q = strvp2wq(nvp)->q_next;
2046 while (SAMESTR(q))
2047 q = q->q_next;
2048 q = RD(q);
2049 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
2050 #else
2051 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
2052 #endif /* _ILP32 */
2053 conn_res->PRIM_type = O_T_CONN_RES;
2054 PRIM_type = O_T_CONN_RES;
2055 } else {
2056 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
2057 conn_res->PRIM_type = T_CONN_RES;
2058 PRIM_type = T_CONN_RES;
2059 }
2060 conn_res->SEQ_number = SEQ_number;
2061 conn_res->OPT_length = 0;
2062 conn_res->OPT_offset = 0;
2063
2064 mutex_enter(&so->so_lock);
2065 so_lock_single(so); /* Set SOLOCKED */
2066 mutex_exit(&so->so_lock);
2067
2068 error = kstrputmsg(SOTOV(so), mp, NULL,
2069 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2070 mutex_enter(&so->so_lock);
2071 if (error) {
2072 eprintsoline(so, error);
2073 goto disconnect_vp;
2074 }
2075 error = sowaitprim(so, PRIM_type, T_OK_ACK,
2076 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2077 if (error) {
2078 eprintsoline(so, error);
2079 goto disconnect_vp;
2080 }
2081 mutex_exit(&so->so_lock);
2082 /*
2083 * If there is a sin/sin6 appended onto the T_OK_ACK use
2084 * that to set the local address. If this is not present
2085 * then we zero out the address and don't set the
2086 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2087 * the pathname from the listening socket.
2088 * In the case where this is TCP or an AF_UNIX socket the
2089 * client side may have queued data or a T_ORDREL in the
2090 * transport. Having now sent the T_CONN_RES we may receive
2091 * those queued messages at any time. Hold the acceptor
2092 * so_lock until its state and laddr are finalized.
2093 */
2094 mutex_enter(&nso->so_lock);
2095 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2096 if ((nso->so_family == AF_INET) || (nso->so_family == AF_INET6) &&
2097 MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2098 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2099 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2100 nsti->sti_laddr_len = sinlen;
2101 nsti->sti_laddr_valid = 1;
2102 } else if (nso->so_family == AF_UNIX) {
2103 ASSERT(so->so_family == AF_UNIX);
2104 nsti->sti_laddr_len = sti->sti_laddr_len;
2105 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2106 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2107 nsti->sti_laddr_len);
2108 nsti->sti_laddr_valid = 1;
2109 } else {
2110 nsti->sti_laddr_len = sti->sti_laddr_len;
2111 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2112 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2113 nsti->sti_laddr_sa->sa_family = nso->so_family;
2114 }
2115 nso->so_state |= SS_ISCONNECTED;
2116 mutex_exit(&nso->so_lock);
2117
2118 freemsg(ack_mp);
2119
2120 mutex_enter(&so->so_lock);
2121 so_unlock_single(so, SOLOCKED);
2122 mutex_exit(&so->so_lock);
2123
2124 /*
2125 * Pass out new socket.
2126 */
2127 if (nsop != NULL)
2128 *nsop = nso;
2129
2130 return (0);
2131
2132
2133 eproto_disc_unl:
2134 error = EPROTO;
2135 e_disc_unl:
2136 eprintsoline(so, error);
2137 goto disconnect_unlocked;
2138
2139 pr_disc_vp_unl:
2140 eprintsoline(so, error);
2141 disconnect_vp_unlocked:
2142 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2143 VN_RELE(nvp);
2144 disconnect_unlocked:
2145 (void) sodisconnect(so, SEQ_number, 0);
2146 return (error);
2147
2148 pr_disc_vp:
2149 eprintsoline(so, error);
2150 disconnect_vp:
2151 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2152 so_unlock_single(so, SOLOCKED);
2153 mutex_exit(&so->so_lock);
2154 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2155 VN_RELE(nvp);
2156 return (error);
2157
2158 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2159 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2160 ? EOPNOTSUPP : EINVAL;
2161 e_bad:
2162 eprintsoline(so, error);
2163 return (error);
2164 }
2165
2166 /*
2167 * connect a socket.
2168 *
2169 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2170 * unconnect (by specifying a null address).
2171 */
2172 int
sotpi_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,struct cred * cr)2173 sotpi_connect(struct sonode *so,
2174 struct sockaddr *name,
2175 socklen_t namelen,
2176 int fflag,
2177 int flags,
2178 struct cred *cr)
2179 {
2180 struct T_conn_req conn_req;
2181 int error = 0;
2182 mblk_t *mp;
2183 void *src;
2184 socklen_t srclen;
2185 void *addr;
2186 socklen_t addrlen;
2187 boolean_t need_unlock;
2188 sotpi_info_t *sti = SOTOTPI(so);
2189
2190 dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2191 (void *)so, (void *)name, namelen, fflag, flags,
2192 pr_state(so->so_state, so->so_mode)));
2193
2194 /*
2195 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2196 * avoid sleeping for memory with SOLOCKED held.
2197 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2198 * + sizeof (struct T_opthdr).
2199 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2200 * exceed sti_faddr_maxlen).
2201 */
2202 mp = soallocproto(sizeof (struct T_conn_req) +
2203 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2204 cr);
2205 if (mp == NULL) {
2206 /*
2207 * Connect can not fail with ENOBUFS. A signal was
2208 * caught so return EINTR.
2209 */
2210 error = EINTR;
2211 eprintsoline(so, error);
2212 return (error);
2213 }
2214
2215 mutex_enter(&so->so_lock);
2216 /*
2217 * Make sure there is a preallocated T_unbind_req message
2218 * before any binding. This message is allocated when the
2219 * socket is created. Since another thread can consume
2220 * so_unbind_mp by the time we return from so_lock_single(),
2221 * we should check the availability of so_unbind_mp after
2222 * we return from so_lock_single().
2223 */
2224
2225 so_lock_single(so); /* Set SOLOCKED */
2226 need_unlock = B_TRUE;
2227
2228 if (sti->sti_unbind_mp == NULL) {
2229 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2230 /* NOTE: holding so_lock while sleeping */
2231 sti->sti_unbind_mp =
2232 soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2233 if (sti->sti_unbind_mp == NULL) {
2234 error = EINTR;
2235 goto done;
2236 }
2237 }
2238
2239 /*
2240 * Can't have done a listen before connecting.
2241 */
2242 if (so->so_state & SS_ACCEPTCONN) {
2243 error = EOPNOTSUPP;
2244 goto done;
2245 }
2246
2247 /*
2248 * Must be bound with the transport
2249 */
2250 if (!(so->so_state & SS_ISBOUND)) {
2251 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2252 /*CONSTCOND*/
2253 so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2254 /*
2255 * Optimization for AF_INET{,6} transports
2256 * that can handle a T_CONN_REQ without being bound.
2257 */
2258 so_automatic_bind(so);
2259 } else {
2260 error = sotpi_bind(so, NULL, 0,
2261 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2262 if (error)
2263 goto done;
2264 }
2265 ASSERT(so->so_state & SS_ISBOUND);
2266 flags |= _SOCONNECT_DID_BIND;
2267 }
2268
2269 /*
2270 * Handle a connect to a name parameter of type AF_UNSPEC like a
2271 * connect to a null address. This is the portable method to
2272 * unconnect a socket.
2273 */
2274 if ((namelen >= sizeof (sa_family_t)) &&
2275 (name->sa_family == AF_UNSPEC)) {
2276 name = NULL;
2277 namelen = 0;
2278 }
2279
2280 /*
2281 * Check that we are not already connected.
2282 * A connection-oriented socket cannot be reconnected.
2283 * A connected connection-less socket can be
2284 * - connected to a different address by a subsequent connect
2285 * - "unconnected" by a connect to the NULL address
2286 */
2287 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
2288 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2289 if (so->so_mode & SM_CONNREQUIRED) {
2290 /* Connection-oriented socket */
2291 error = so->so_state & SS_ISCONNECTED ?
2292 EISCONN : EALREADY;
2293 goto done;
2294 }
2295 /* Connection-less socket */
2296 if (name == NULL) {
2297 /*
2298 * Remove the connected state and clear SO_DGRAM_ERRIND
2299 * since it was set when the socket was connected.
2300 * If this is UDP also send down a T_DISCON_REQ.
2301 */
2302 int val;
2303
2304 if ((so->so_family == AF_INET ||
2305 so->so_family == AF_INET6) &&
2306 (so->so_type == SOCK_DGRAM ||
2307 so->so_type == SOCK_RAW) &&
2308 /*CONSTCOND*/
2309 !soconnect_tpi_udp) {
2310 /* XXX What about implicitly unbinding here? */
2311 error = sodisconnect(so, -1,
2312 _SODISCONNECT_LOCK_HELD);
2313 } else {
2314 so->so_state &=
2315 ~(SS_ISCONNECTED | SS_ISCONNECTING);
2316 sti->sti_faddr_valid = 0;
2317 sti->sti_faddr_len = 0;
2318 }
2319
2320 /* Remove SOLOCKED since setsockopt will grab it */
2321 so_unlock_single(so, SOLOCKED);
2322 mutex_exit(&so->so_lock);
2323
2324 val = 0;
2325 (void) sotpi_setsockopt(so, SOL_SOCKET,
2326 SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2327 cr);
2328
2329 mutex_enter(&so->so_lock);
2330 so_lock_single(so); /* Set SOLOCKED */
2331 goto done;
2332 }
2333 }
2334 ASSERT(so->so_state & SS_ISBOUND);
2335
2336 if (name == NULL || namelen == 0) {
2337 error = EINVAL;
2338 goto done;
2339 }
2340 /*
2341 * Mark the socket if sti_faddr_sa represents the transport level
2342 * address.
2343 */
2344 if (flags & _SOCONNECT_NOXLATE) {
2345 struct sockaddr_ux *soaddr_ux;
2346
2347 ASSERT(so->so_family == AF_UNIX);
2348 if (namelen != sizeof (struct sockaddr_ux)) {
2349 error = EINVAL;
2350 goto done;
2351 }
2352 soaddr_ux = (struct sockaddr_ux *)name;
2353 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2354 namelen = sizeof (soaddr_ux->sou_addr);
2355 sti->sti_faddr_noxlate = 1;
2356 }
2357
2358 /*
2359 * Length and family checks.
2360 */
2361 error = so_addr_verify(so, name, namelen);
2362 if (error)
2363 goto bad;
2364
2365 /*
2366 * Save foreign address. Needed for AF_UNIX as well as
2367 * transport providers that do not support TI_GETPEERNAME.
2368 * Also used for cached foreign address for TCP and UDP.
2369 */
2370 if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2371 error = EINVAL;
2372 goto done;
2373 }
2374 sti->sti_faddr_len = (socklen_t)namelen;
2375 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2376 bcopy(name, sti->sti_faddr_sa, namelen);
2377 sti->sti_faddr_valid = 1;
2378
2379 if (so->so_family == AF_UNIX) {
2380 if (sti->sti_faddr_noxlate) {
2381 /*
2382 * sti_faddr is a transport-level address, so
2383 * don't pass it as an option. Do save it in
2384 * sti_ux_faddr, used for connected DG send.
2385 */
2386 src = NULL;
2387 srclen = 0;
2388 addr = sti->sti_faddr_sa;
2389 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2390 bcopy(addr, &sti->sti_ux_faddr,
2391 sizeof (sti->sti_ux_faddr));
2392 } else {
2393 /*
2394 * Pass the sockaddr_un source address as an option
2395 * and translate the remote address.
2396 * Holding so_lock thus sti_laddr_sa can not change.
2397 */
2398 src = sti->sti_laddr_sa;
2399 srclen = (t_uscalar_t)sti->sti_laddr_len;
2400 dprintso(so, 1,
2401 ("sotpi_connect UNIX: srclen %d, src %p\n",
2402 srclen, src));
2403 /*
2404 * Translate the destination address into our
2405 * internal form, and save it in sti_ux_faddr.
2406 * After this call, addr==&sti->sti_ux_taddr,
2407 * and we copy that to sti->sti_ux_faddr so
2408 * we save the connected peer address.
2409 */
2410 error = so_ux_addr_xlate(so,
2411 sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2412 (flags & _SOCONNECT_XPG4_2),
2413 &addr, &addrlen);
2414 if (error)
2415 goto bad;
2416 bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2417 sizeof (sti->sti_ux_faddr));
2418 }
2419 } else {
2420 addr = sti->sti_faddr_sa;
2421 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2422 src = NULL;
2423 srclen = 0;
2424 }
2425 /*
2426 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2427 * option which asks the transport provider to send T_UDERR_IND
2428 * messages. These T_UDERR_IND messages are used to return connected
2429 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2430 *
2431 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2432 * we send down a T_CONN_REQ. This is needed to let the
2433 * transport assign a local address that is consistent with
2434 * the remote address. Applications depend on a getsockname()
2435 * after a connect() to retrieve the "source" IP address for
2436 * the connected socket. Invalidate the cached local address
2437 * to force getsockname() to enquire of the transport.
2438 */
2439 if (!(so->so_mode & SM_CONNREQUIRED)) {
2440 /*
2441 * Datagram socket.
2442 */
2443 int32_t val;
2444
2445 so_unlock_single(so, SOLOCKED);
2446 mutex_exit(&so->so_lock);
2447
2448 val = 1;
2449 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2450 &val, (t_uscalar_t)sizeof (val), cr);
2451
2452 mutex_enter(&so->so_lock);
2453 so_lock_single(so); /* Set SOLOCKED */
2454 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2455 (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2456 soconnect_tpi_udp) {
2457 soisconnected(so);
2458 goto done;
2459 }
2460 /*
2461 * Send down T_CONN_REQ etc.
2462 * Clear fflag to avoid returning EWOULDBLOCK.
2463 */
2464 fflag = 0;
2465 ASSERT(so->so_family != AF_UNIX);
2466 sti->sti_laddr_valid = 0;
2467 } else if (sti->sti_laddr_len != 0) {
2468 /*
2469 * If the local address or port was "any" then it may be
2470 * changed by the transport as a result of the
2471 * connect. Invalidate the cached version if we have one.
2472 */
2473 switch (so->so_family) {
2474 case AF_INET:
2475 ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2476 if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2477 INADDR_ANY ||
2478 ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2479 sti->sti_laddr_valid = 0;
2480 break;
2481
2482 case AF_INET6:
2483 ASSERT(sti->sti_laddr_len ==
2484 (socklen_t)sizeof (sin6_t));
2485 if (IN6_IS_ADDR_UNSPECIFIED(
2486 &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2487 IN6_IS_ADDR_V4MAPPED_ANY(
2488 &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2489 ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2490 sti->sti_laddr_valid = 0;
2491 break;
2492
2493 default:
2494 break;
2495 }
2496 }
2497
2498 /*
2499 * Check for failure of an earlier call
2500 */
2501 if (so->so_error != 0)
2502 goto so_bad;
2503
2504 /*
2505 * Send down T_CONN_REQ. Message was allocated above.
2506 */
2507 conn_req.PRIM_type = T_CONN_REQ;
2508 conn_req.DEST_length = addrlen;
2509 conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2510 if (srclen == 0) {
2511 conn_req.OPT_length = 0;
2512 conn_req.OPT_offset = 0;
2513 soappendmsg(mp, &conn_req, sizeof (conn_req));
2514 soappendmsg(mp, addr, addrlen);
2515 } else {
2516 /*
2517 * There is a AF_UNIX sockaddr_un to include as a source
2518 * address option.
2519 */
2520 struct T_opthdr toh;
2521
2522 toh.level = SOL_SOCKET;
2523 toh.name = SO_SRCADDR;
2524 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2525 toh.status = 0;
2526 conn_req.OPT_length =
2527 (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2528 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2529 _TPI_ALIGN_TOPT(addrlen));
2530
2531 soappendmsg(mp, &conn_req, sizeof (conn_req));
2532 soappendmsg(mp, addr, addrlen);
2533 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2534 soappendmsg(mp, &toh, sizeof (toh));
2535 soappendmsg(mp, src, srclen);
2536 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2537 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2538 }
2539 /*
2540 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2541 * in order to have the right state when the T_CONN_CON shows up.
2542 */
2543 soisconnecting(so);
2544 mutex_exit(&so->so_lock);
2545
2546 if (AU_AUDITING())
2547 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2548
2549 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2550 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2551 mp = NULL;
2552 mutex_enter(&so->so_lock);
2553 if (error != 0)
2554 goto bad;
2555
2556 if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2557 goto bad;
2558
2559 /* Allow other threads to access the socket */
2560 so_unlock_single(so, SOLOCKED);
2561 need_unlock = B_FALSE;
2562
2563 /*
2564 * Wait until we get a T_CONN_CON or an error
2565 */
2566 if ((error = sowaitconnected(so, fflag, 0)) != 0) {
2567 so_lock_single(so); /* Set SOLOCKED */
2568 need_unlock = B_TRUE;
2569 }
2570
2571 done:
2572 freemsg(mp);
2573 switch (error) {
2574 case EINPROGRESS:
2575 case EALREADY:
2576 case EISCONN:
2577 case EINTR:
2578 /* Non-fatal errors */
2579 sti->sti_laddr_valid = 0;
2580 /* FALLTHRU */
2581 case 0:
2582 break;
2583 default:
2584 ASSERT(need_unlock);
2585 /*
2586 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2587 * and invalidate local-address cache
2588 */
2589 so->so_state &= ~SS_ISCONNECTING;
2590 sti->sti_laddr_valid = 0;
2591 /* A discon_ind might have already unbound us */
2592 if ((flags & _SOCONNECT_DID_BIND) &&
2593 (so->so_state & SS_ISBOUND)) {
2594 int err;
2595
2596 err = sotpi_unbind(so, 0);
2597 /* LINTED - statement has no conseq */
2598 if (err) {
2599 eprintsoline(so, err);
2600 }
2601 }
2602 break;
2603 }
2604 if (need_unlock)
2605 so_unlock_single(so, SOLOCKED);
2606 mutex_exit(&so->so_lock);
2607 return (error);
2608
2609 so_bad: error = sogeterr(so, B_TRUE);
2610 bad: eprintsoline(so, error);
2611 goto done;
2612 }
2613
2614 /* ARGSUSED */
2615 int
sotpi_shutdown(struct sonode * so,int how,struct cred * cr)2616 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2617 {
2618 struct T_ordrel_req ordrel_req;
2619 mblk_t *mp;
2620 uint_t old_state, state_change;
2621 int error = 0;
2622 sotpi_info_t *sti = SOTOTPI(so);
2623
2624 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2625 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2626
2627 mutex_enter(&so->so_lock);
2628 so_lock_single(so); /* Set SOLOCKED */
2629
2630 /*
2631 * SunOS 4.X has no check for datagram sockets.
2632 * 5.X checks that it is connected (ENOTCONN)
2633 * X/Open requires that we check the connected state.
2634 */
2635 if (!(so->so_state & SS_ISCONNECTED)) {
2636 if (!xnet_skip_checks) {
2637 error = ENOTCONN;
2638 if (xnet_check_print) {
2639 printf("sockfs: X/Open shutdown check "
2640 "caused ENOTCONN\n");
2641 }
2642 }
2643 goto done;
2644 }
2645 /*
2646 * Record the current state and then perform any state changes.
2647 * Then use the difference between the old and new states to
2648 * determine which messages need to be sent.
2649 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2650 * duplicate calls to shutdown().
2651 */
2652 old_state = so->so_state;
2653
2654 switch (how) {
2655 case 0:
2656 socantrcvmore(so);
2657 break;
2658 case 1:
2659 socantsendmore(so);
2660 break;
2661 case 2:
2662 socantsendmore(so);
2663 socantrcvmore(so);
2664 break;
2665 default:
2666 error = EINVAL;
2667 goto done;
2668 }
2669
2670 /*
2671 * Assumes that the SS_CANT* flags are never cleared in the above code.
2672 */
2673 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2674 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2675 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2676
2677 switch (state_change) {
2678 case 0:
2679 dprintso(so, 1,
2680 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2681 so->so_state));
2682 goto done;
2683
2684 case SS_CANTRCVMORE:
2685 mutex_exit(&so->so_lock);
2686 strseteof(SOTOV(so), 1);
2687 /*
2688 * strseteof takes care of read side wakeups,
2689 * pollwakeups, and signals.
2690 */
2691 /*
2692 * Get the read lock before flushing data to avoid problems
2693 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2694 */
2695 mutex_enter(&so->so_lock);
2696 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2697 mutex_exit(&so->so_lock);
2698
2699 /* Flush read side queue */
2700 strflushrq(SOTOV(so), FLUSHALL);
2701
2702 mutex_enter(&so->so_lock);
2703 so_unlock_read(so); /* Clear SOREADLOCKED */
2704 break;
2705
2706 case SS_CANTSENDMORE:
2707 mutex_exit(&so->so_lock);
2708 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2709 mutex_enter(&so->so_lock);
2710 break;
2711
2712 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2713 mutex_exit(&so->so_lock);
2714 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2715 strseteof(SOTOV(so), 1);
2716 /*
2717 * strseteof takes care of read side wakeups,
2718 * pollwakeups, and signals.
2719 */
2720 /*
2721 * Get the read lock before flushing data to avoid problems
2722 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2723 */
2724 mutex_enter(&so->so_lock);
2725 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2726 mutex_exit(&so->so_lock);
2727
2728 /* Flush read side queue */
2729 strflushrq(SOTOV(so), FLUSHALL);
2730
2731 mutex_enter(&so->so_lock);
2732 so_unlock_read(so); /* Clear SOREADLOCKED */
2733 break;
2734 }
2735
2736 ASSERT(MUTEX_HELD(&so->so_lock));
2737
2738 /*
2739 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2740 * was set due to this call and the new state has both of them set:
2741 * Send the AF_UNIX close indication
2742 * For T_COTS send a discon_ind
2743 *
2744 * If cantsend was set due to this call:
2745 * For T_COTSORD send an ordrel_ind
2746 *
2747 * Note that for T_CLTS there is no message sent here.
2748 */
2749 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2750 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2751 /*
2752 * For SunOS 4.X compatibility we tell the other end
2753 * that we are unable to receive at this point.
2754 */
2755 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2756 so_unix_close(so);
2757
2758 if (sti->sti_serv_type == T_COTS)
2759 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2760 }
2761 if ((state_change & SS_CANTSENDMORE) &&
2762 (sti->sti_serv_type == T_COTS_ORD)) {
2763 /* Send an orderly release */
2764 ordrel_req.PRIM_type = T_ORDREL_REQ;
2765
2766 mutex_exit(&so->so_lock);
2767 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2768 0, _ALLOC_SLEEP, cr);
2769 /*
2770 * Send down the T_ORDREL_REQ even if there is flow control.
2771 * This prevents shutdown from blocking.
2772 * Note that there is no T_OK_ACK for ordrel_req.
2773 */
2774 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2775 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2776 mutex_enter(&so->so_lock);
2777 if (error) {
2778 eprintsoline(so, error);
2779 goto done;
2780 }
2781 }
2782
2783 done:
2784 so_unlock_single(so, SOLOCKED);
2785 mutex_exit(&so->so_lock);
2786 return (error);
2787 }
2788
2789 /*
2790 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2791 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2792 * that we have closed.
2793 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2794 * T_UNITDATA_REQ containing the same option.
2795 *
2796 * For SOCK_DGRAM half-connections (somebody connected to this end
2797 * but this end is not connect) we don't know where to send any
2798 * SO_UNIX_CLOSE.
2799 *
2800 * We have to ignore stream head errors just in case there has been
2801 * a shutdown(output).
2802 * Ignore any flow control to try to get the message more quickly to the peer.
2803 * While locally ignoring flow control solves the problem when there
2804 * is only the loopback transport on the stream it would not provide
2805 * the correct AF_UNIX socket semantics when one or more modules have
2806 * been pushed.
2807 */
2808 void
so_unix_close(struct sonode * so)2809 so_unix_close(struct sonode *so)
2810 {
2811 struct T_opthdr toh;
2812 mblk_t *mp;
2813 sotpi_info_t *sti = SOTOTPI(so);
2814
2815 ASSERT(MUTEX_HELD(&so->so_lock));
2816
2817 ASSERT(so->so_family == AF_UNIX);
2818
2819 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2820 (SS_ISCONNECTED|SS_ISBOUND))
2821 return;
2822
2823 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2824 (void *)so, pr_state(so->so_state, so->so_mode)));
2825
2826 toh.level = SOL_SOCKET;
2827 toh.name = SO_UNIX_CLOSE;
2828
2829 /* zero length + header */
2830 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2831 toh.status = 0;
2832
2833 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2834 struct T_optdata_req tdr;
2835
2836 tdr.PRIM_type = T_OPTDATA_REQ;
2837 tdr.DATA_flag = 0;
2838
2839 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2840 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2841
2842 /* NOTE: holding so_lock while sleeping */
2843 mp = soallocproto2(&tdr, sizeof (tdr),
2844 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2845 } else {
2846 struct T_unitdata_req tudr;
2847 void *addr;
2848 socklen_t addrlen;
2849 void *src;
2850 socklen_t srclen;
2851 struct T_opthdr toh2;
2852 t_scalar_t size;
2853
2854 /*
2855 * We know this is an AF_UNIX connected DGRAM socket.
2856 * We therefore already have the destination address
2857 * in the internal form needed for this send. This is
2858 * similar to the sosend_dgram call later in this file
2859 * when there's no user-specified destination address.
2860 */
2861 if (sti->sti_faddr_noxlate) {
2862 /*
2863 * Already have a transport internal address. Do not
2864 * pass any (transport internal) source address.
2865 */
2866 addr = sti->sti_faddr_sa;
2867 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2868 src = NULL;
2869 srclen = 0;
2870 } else {
2871 /*
2872 * Pass the sockaddr_un source address as an option
2873 * and translate the remote address.
2874 * Holding so_lock thus sti_laddr_sa can not change.
2875 */
2876 src = sti->sti_laddr_sa;
2877 srclen = (socklen_t)sti->sti_laddr_len;
2878 dprintso(so, 1,
2879 ("so_ux_close: srclen %d, src %p\n",
2880 srclen, src));
2881 /*
2882 * Use the destination address saved in connect.
2883 */
2884 addr = &sti->sti_ux_faddr;
2885 addrlen = sizeof (sti->sti_ux_faddr);
2886 }
2887 tudr.PRIM_type = T_UNITDATA_REQ;
2888 tudr.DEST_length = addrlen;
2889 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2890 if (srclen == 0) {
2891 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2892 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2893 _TPI_ALIGN_TOPT(addrlen));
2894
2895 size = tudr.OPT_offset + tudr.OPT_length;
2896 /* NOTE: holding so_lock while sleeping */
2897 mp = soallocproto2(&tudr, sizeof (tudr),
2898 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2899 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2900 soappendmsg(mp, &toh, sizeof (toh));
2901 } else {
2902 /*
2903 * There is a AF_UNIX sockaddr_un to include as a
2904 * source address option.
2905 */
2906 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2907 _TPI_ALIGN_TOPT(srclen));
2908 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2909 _TPI_ALIGN_TOPT(addrlen));
2910
2911 toh2.level = SOL_SOCKET;
2912 toh2.name = SO_SRCADDR;
2913 toh2.len = (t_uscalar_t)(srclen +
2914 sizeof (struct T_opthdr));
2915 toh2.status = 0;
2916
2917 size = tudr.OPT_offset + tudr.OPT_length;
2918
2919 /* NOTE: holding so_lock while sleeping */
2920 mp = soallocproto2(&tudr, sizeof (tudr),
2921 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2922 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2923 soappendmsg(mp, &toh, sizeof (toh));
2924 soappendmsg(mp, &toh2, sizeof (toh2));
2925 soappendmsg(mp, src, srclen);
2926 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2927 }
2928 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2929 }
2930 mutex_exit(&so->so_lock);
2931 (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2932 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2933 mutex_enter(&so->so_lock);
2934 }
2935
2936 /*
2937 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2938 * In addition, the caller typically verifies that there is some
2939 * potential state to clear by checking
2940 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2941 * before calling this routine.
2942 * Note that such a check can be made without holding so_lock since
2943 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2944 * decrements sti_oobsigcnt.
2945 *
2946 * When data is read *after* the point that all pending
2947 * oob data has been consumed the oob indication is cleared.
2948 *
2949 * This logic keeps select/poll returning POLLRDBAND and
2950 * SIOCATMARK returning true until we have read past
2951 * the mark.
2952 */
2953 static void
sorecv_update_oobstate(struct sonode * so)2954 sorecv_update_oobstate(struct sonode *so)
2955 {
2956 sotpi_info_t *sti = SOTOTPI(so);
2957
2958 mutex_enter(&so->so_lock);
2959 ASSERT(so_verify_oobstate(so));
2960 dprintso(so, 1,
2961 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2962 sti->sti_oobsigcnt,
2963 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2964 if (sti->sti_oobsigcnt == 0) {
2965 /* No more pending oob indications */
2966 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2967 freemsg(so->so_oobmsg);
2968 so->so_oobmsg = NULL;
2969 }
2970 ASSERT(so_verify_oobstate(so));
2971 mutex_exit(&so->so_lock);
2972 }
2973
2974 /*
2975 * Handle recv* calls for an so which has NL7C saved recv mblk_t(s).
2976 */
2977 static int
nl7c_sorecv(struct sonode * so,mblk_t ** rmp,uio_t * uiop,rval_t * rp)2978 nl7c_sorecv(struct sonode *so, mblk_t **rmp, uio_t *uiop, rval_t *rp)
2979 {
2980 sotpi_info_t *sti = SOTOTPI(so);
2981 int error = 0;
2982 mblk_t *tmp = NULL;
2983 mblk_t *pmp = NULL;
2984 mblk_t *nmp = sti->sti_nl7c_rcv_mp;
2985
2986 ASSERT(nmp != NULL);
2987
2988 while (nmp != NULL && uiop->uio_resid > 0) {
2989 ssize_t n;
2990
2991 if (DB_TYPE(nmp) == M_DATA) {
2992 /*
2993 * We have some data, uiomove up to resid bytes.
2994 */
2995 n = MIN(MBLKL(nmp), uiop->uio_resid);
2996 if (n > 0)
2997 error = uiomove(nmp->b_rptr, n, UIO_READ, uiop);
2998 nmp->b_rptr += n;
2999 if (nmp->b_rptr == nmp->b_wptr) {
3000 pmp = nmp;
3001 nmp = nmp->b_cont;
3002 }
3003 if (error)
3004 break;
3005 } else {
3006 /*
3007 * We only handle data, save for caller to handle.
3008 */
3009 if (pmp != NULL) {
3010 pmp->b_cont = nmp->b_cont;
3011 }
3012 nmp->b_cont = NULL;
3013 if (*rmp == NULL) {
3014 *rmp = nmp;
3015 } else {
3016 tmp->b_cont = nmp;
3017 }
3018 nmp = nmp->b_cont;
3019 tmp = nmp;
3020 }
3021 }
3022 if (pmp != NULL) {
3023 /* Free any mblk_t(s) which we have consumed */
3024 pmp->b_cont = NULL;
3025 freemsg(sti->sti_nl7c_rcv_mp);
3026 }
3027 if ((sti->sti_nl7c_rcv_mp = nmp) == NULL) {
3028 /* Last mblk_t so return the saved kstrgetmsg() rval/error */
3029 if (error == 0) {
3030 rval_t *p = (rval_t *)&sti->sti_nl7c_rcv_rval;
3031
3032 error = p->r_v.r_v2;
3033 p->r_v.r_v2 = 0;
3034 }
3035 rp->r_vals = sti->sti_nl7c_rcv_rval;
3036 sti->sti_nl7c_rcv_rval = 0;
3037 } else {
3038 /* More mblk_t(s) to process so no rval to return */
3039 rp->r_vals = 0;
3040 }
3041 return (error);
3042 }
3043 /*
3044 * Receive the next message on the queue.
3045 * If msg_controllen is non-zero when called the caller is interested in
3046 * any received control info (options).
3047 * If msg_namelen is non-zero when called the caller is interested in
3048 * any received source address.
3049 * The routine returns with msg_control and msg_name pointing to
3050 * kmem_alloc'ed memory which the caller has to free.
3051 */
3052 /* ARGSUSED */
3053 int
sotpi_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)3054 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
3055 struct cred *cr)
3056 {
3057 union T_primitives *tpr;
3058 mblk_t *mp;
3059 uchar_t pri;
3060 int pflag, opflag;
3061 void *control;
3062 t_uscalar_t controllen;
3063 t_uscalar_t namelen;
3064 int so_state = so->so_state; /* Snapshot */
3065 ssize_t saved_resid;
3066 rval_t rval;
3067 int flags;
3068 clock_t timout;
3069 int error = 0;
3070 sotpi_info_t *sti = SOTOTPI(so);
3071
3072 flags = msg->msg_flags;
3073 msg->msg_flags = 0;
3074
3075 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
3076 (void *)so, (void *)msg, flags,
3077 pr_state(so->so_state, so->so_mode), so->so_error));
3078
3079 if (so->so_version == SOV_STREAM) {
3080 so_update_attrs(so, SOACC);
3081 /* The imaginary "sockmod" has been popped - act as a stream */
3082 return (strread(SOTOV(so), uiop, cr));
3083 }
3084
3085 /*
3086 * If we are not connected because we have never been connected
3087 * we return ENOTCONN. If we have been connected (but are no longer
3088 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
3089 * the EOF.
3090 *
3091 * An alternative would be to post an ENOTCONN error in stream head
3092 * (read+write) and clear it when we're connected. However, that error
3093 * would cause incorrect poll/select behavior!
3094 */
3095 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
3096 (so->so_mode & SM_CONNREQUIRED)) {
3097 return (ENOTCONN);
3098 }
3099
3100 /*
3101 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
3102 * after checking that the read queue is empty) and returns zero.
3103 * This implementation will sleep (in kstrgetmsg) even if uio_resid
3104 * is zero.
3105 */
3106
3107 if (flags & MSG_OOB) {
3108 /* Check that the transport supports OOB */
3109 if (!(so->so_mode & SM_EXDATA))
3110 return (EOPNOTSUPP);
3111 so_update_attrs(so, SOACC);
3112 return (sorecvoob(so, msg, uiop, flags,
3113 (so->so_options & SO_OOBINLINE)));
3114 }
3115
3116 so_update_attrs(so, SOACC);
3117
3118 /*
3119 * Set msg_controllen and msg_namelen to zero here to make it
3120 * simpler in the cases that no control or name is returned.
3121 */
3122 controllen = msg->msg_controllen;
3123 namelen = msg->msg_namelen;
3124 msg->msg_controllen = 0;
3125 msg->msg_namelen = 0;
3126
3127 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
3128 namelen, controllen));
3129
3130 mutex_enter(&so->so_lock);
3131 /*
3132 * If an NL7C enabled socket and not waiting for write data.
3133 */
3134 if ((sti->sti_nl7c_flags & (NL7C_ENABLED | NL7C_WAITWRITE)) ==
3135 NL7C_ENABLED) {
3136 if (sti->sti_nl7c_uri) {
3137 /* Close uri processing for a previous request */
3138 nl7c_close(so);
3139 }
3140 if ((so_state & SS_CANTRCVMORE) &&
3141 sti->sti_nl7c_rcv_mp == NULL) {
3142 /* Nothing to process, EOF */
3143 mutex_exit(&so->so_lock);
3144 return (0);
3145 } else if (sti->sti_nl7c_flags & NL7C_SOPERSIST) {
3146 /* Persistent NL7C socket, try to process request */
3147 boolean_t ret;
3148
3149 ret = nl7c_process(so,
3150 (so->so_state & (SS_NONBLOCK|SS_NDELAY)));
3151 rval.r_vals = sti->sti_nl7c_rcv_rval;
3152 error = rval.r_v.r_v2;
3153 if (error) {
3154 /* Error of some sort, return it */
3155 mutex_exit(&so->so_lock);
3156 return (error);
3157 }
3158 if (sti->sti_nl7c_flags &&
3159 ! (sti->sti_nl7c_flags & NL7C_WAITWRITE)) {
3160 /*
3161 * Still an NL7C socket and no data
3162 * to pass up to the caller.
3163 */
3164 mutex_exit(&so->so_lock);
3165 if (ret) {
3166 /* EOF */
3167 return (0);
3168 } else {
3169 /* Need more data */
3170 return (EAGAIN);
3171 }
3172 }
3173 } else {
3174 /*
3175 * Not persistent so no further NL7C processing.
3176 */
3177 sti->sti_nl7c_flags = 0;
3178 }
3179 }
3180 /*
3181 * Only one reader is allowed at any given time. This is needed
3182 * for T_EXDATA handling and, in the future, MSG_WAITALL.
3183 *
	 * This is slightly different than BSD behavior in that it fails with
3185 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
3186 * is single-threaded using sblock(), which is dropped while waiting
3187 * for data to appear. The difference shows up e.g. if one
3188 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3189 * does use nonblocking io and different threads are reading each
3190 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3191 * in this case as long as the read queue doesn't get empty.
3192 * In this implementation the thread using nonblocking io can
3193 * get an EWOULDBLOCK error due to the blocking thread executing
3194 * e.g. in the uiomove in kstrgetmsg.
3195 * This difference is not believed to be significant.
3196 */
3197 /* Set SOREADLOCKED */
3198 error = so_lock_read_intr(so,
3199 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3200 mutex_exit(&so->so_lock);
3201 if (error)
3202 return (error);
3203
3204 /*
3205 * Tell kstrgetmsg to not inspect the stream head errors until all
3206 * queued data has been consumed.
3207 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3208 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3209 *
3210 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3211 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3212 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3213 */
3214 pflag = MSG_ANY | MSG_DELAYERROR;
3215 if (flags & MSG_PEEK) {
3216 pflag |= MSG_IPEEK;
3217 flags &= ~MSG_WAITALL;
3218 }
3219 if (so->so_mode & SM_ATOMIC)
3220 pflag |= MSG_DISCARDTAIL;
3221
3222 if (flags & MSG_DONTWAIT)
3223 timout = 0;
3224 else if (so->so_rcvtimeo != 0)
3225 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3226 else
3227 timout = -1;
3228 opflag = pflag;
3229 retry:
3230 saved_resid = uiop->uio_resid;
3231 pri = 0;
3232 mp = NULL;
3233 if (sti->sti_nl7c_rcv_mp != NULL) {
3234 /* Already kstrgetmsg()ed saved mblk(s) from NL7C */
3235 error = nl7c_sorecv(so, &mp, uiop, &rval);
3236 } else {
3237 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3238 timout, &rval);
3239 }
3240 if (error != 0) {
3241 /* kstrgetmsg returns ETIME when timeout expires */
3242 if (error == ETIME)
3243 error = EWOULDBLOCK;
3244 goto out;
3245 }
3246 /*
3247 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3248 * For non-datagrams MOREDATA is used to set MSG_EOR.
3249 */
3250 ASSERT(!(rval.r_val1 & MORECTL));
3251 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3252 msg->msg_flags |= MSG_TRUNC;
3253
3254 if (mp == NULL) {
3255 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3256 /*
3257 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3258 * The draft Posix socket spec states that the mark should
3259 * not be cleared when peeking. We follow the latter.
3260 */
3261 if ((so->so_state &
3262 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3263 (uiop->uio_resid != saved_resid) &&
3264 !(flags & MSG_PEEK)) {
3265 sorecv_update_oobstate(so);
3266 }
3267
3268 mutex_enter(&so->so_lock);
3269 /* Set MSG_EOR based on MOREDATA */
3270 if (!(rval.r_val1 & MOREDATA)) {
3271 if (so->so_state & SS_SAVEDEOR) {
3272 msg->msg_flags |= MSG_EOR;
3273 so->so_state &= ~SS_SAVEDEOR;
3274 }
3275 }
3276 /*
3277 * If some data was received (i.e. not EOF) and the
3278 * read/recv* has not been satisfied wait for some more.
3279 */
3280 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3281 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3282 mutex_exit(&so->so_lock);
3283 pflag = opflag | MSG_NOMARK;
3284 goto retry;
3285 }
3286 goto out_locked;
3287 }
3288
3289 /* strsock_proto has already verified length and alignment */
3290 tpr = (union T_primitives *)mp->b_rptr;
3291 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3292
3293 switch (tpr->type) {
3294 case T_DATA_IND: {
3295 if ((so->so_state &
3296 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3297 (uiop->uio_resid != saved_resid) &&
3298 !(flags & MSG_PEEK)) {
3299 sorecv_update_oobstate(so);
3300 }
3301
3302 /*
3303 * Set msg_flags to MSG_EOR based on
3304 * MORE_flag and MOREDATA.
3305 */
3306 mutex_enter(&so->so_lock);
3307 so->so_state &= ~SS_SAVEDEOR;
3308 if (!(tpr->data_ind.MORE_flag & 1)) {
3309 if (!(rval.r_val1 & MOREDATA))
3310 msg->msg_flags |= MSG_EOR;
3311 else
3312 so->so_state |= SS_SAVEDEOR;
3313 }
3314 freemsg(mp);
3315 /*
3316 * If some data was received (i.e. not EOF) and the
3317 * read/recv* has not been satisfied wait for some more.
3318 */
3319 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3320 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3321 mutex_exit(&so->so_lock);
3322 pflag = opflag | MSG_NOMARK;
3323 goto retry;
3324 }
3325 goto out_locked;
3326 }
3327 case T_UNITDATA_IND: {
3328 void *addr;
3329 t_uscalar_t addrlen;
3330 void *abuf;
3331 t_uscalar_t optlen;
3332 void *opt;
3333
3334 if ((so->so_state &
3335 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3336 (uiop->uio_resid != saved_resid) &&
3337 !(flags & MSG_PEEK)) {
3338 sorecv_update_oobstate(so);
3339 }
3340
3341 if (namelen != 0) {
3342 /* Caller wants source address */
3343 addrlen = tpr->unitdata_ind.SRC_length;
3344 addr = sogetoff(mp,
3345 tpr->unitdata_ind.SRC_offset,
3346 addrlen, 1);
3347 if (addr == NULL) {
3348 freemsg(mp);
3349 error = EPROTO;
3350 eprintsoline(so, error);
3351 goto out;
3352 }
3353 if (so->so_family == AF_UNIX) {
3354 /*
3355 * Can not use the transport level address.
3356 * If there is a SO_SRCADDR option carrying
3357 * the socket level address it will be
3358 * extracted below.
3359 */
3360 addr = NULL;
3361 addrlen = 0;
3362 }
3363 }
3364 optlen = tpr->unitdata_ind.OPT_length;
3365 if (optlen != 0) {
3366 t_uscalar_t ncontrollen;
3367
3368 /*
3369 * Extract any source address option.
3370 * Determine how large cmsg buffer is needed.
3371 */
3372 opt = sogetoff(mp,
3373 tpr->unitdata_ind.OPT_offset,
3374 optlen, __TPI_ALIGN_SIZE);
3375
3376 if (opt == NULL) {
3377 freemsg(mp);
3378 error = EPROTO;
3379 eprintsoline(so, error);
3380 goto out;
3381 }
3382 if (so->so_family == AF_UNIX)
3383 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3384 ncontrollen = so_cmsglen(mp, opt, optlen,
3385 !(flags & MSG_XPG4_2));
3386 if (controllen != 0)
3387 controllen = ncontrollen;
3388 else if (ncontrollen != 0)
3389 msg->msg_flags |= MSG_CTRUNC;
3390 } else {
3391 controllen = 0;
3392 }
3393
3394 if (namelen != 0) {
3395 /*
3396 * Return address to caller.
3397 * Caller handles truncation if length
3398 * exceeds msg_namelen.
3399 * NOTE: AF_UNIX NUL termination is ensured by
3400 * the sender's copyin_name().
3401 */
3402 abuf = kmem_alloc(addrlen, KM_SLEEP);
3403
3404 bcopy(addr, abuf, addrlen);
3405 msg->msg_name = abuf;
3406 msg->msg_namelen = addrlen;
3407 }
3408
3409 if (controllen != 0) {
3410 /*
3411 * Return control msg to caller.
3412 * Caller handles truncation if length
3413 * exceeds msg_controllen.
3414 */
3415 control = kmem_zalloc(controllen, KM_SLEEP);
3416
3417 error = so_opt2cmsg(mp, opt, optlen,
3418 !(flags & MSG_XPG4_2),
3419 control, controllen);
3420 if (error) {
3421 freemsg(mp);
3422 if (msg->msg_namelen != 0)
3423 kmem_free(msg->msg_name,
3424 msg->msg_namelen);
3425 kmem_free(control, controllen);
3426 eprintsoline(so, error);
3427 goto out;
3428 }
3429 msg->msg_control = control;
3430 msg->msg_controllen = controllen;
3431 }
3432
3433 freemsg(mp);
3434 goto out;
3435 }
3436 case T_OPTDATA_IND: {
3437 struct T_optdata_req *tdr;
3438 void *opt;
3439 t_uscalar_t optlen;
3440
3441 if ((so->so_state &
3442 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3443 (uiop->uio_resid != saved_resid) &&
3444 !(flags & MSG_PEEK)) {
3445 sorecv_update_oobstate(so);
3446 }
3447
3448 tdr = (struct T_optdata_req *)mp->b_rptr;
3449 optlen = tdr->OPT_length;
3450 if (optlen != 0) {
3451 t_uscalar_t ncontrollen;
3452 /*
3453 * Determine how large cmsg buffer is needed.
3454 */
3455 opt = sogetoff(mp,
3456 tpr->optdata_ind.OPT_offset,
3457 optlen, __TPI_ALIGN_SIZE);
3458
3459 if (opt == NULL) {
3460 freemsg(mp);
3461 error = EPROTO;
3462 eprintsoline(so, error);
3463 goto out;
3464 }
3465
3466 ncontrollen = so_cmsglen(mp, opt, optlen,
3467 !(flags & MSG_XPG4_2));
3468 if (controllen != 0)
3469 controllen = ncontrollen;
3470 else if (ncontrollen != 0)
3471 msg->msg_flags |= MSG_CTRUNC;
3472 } else {
3473 controllen = 0;
3474 }
3475
3476 if (controllen != 0) {
3477 /*
3478 * Return control msg to caller.
3479 * Caller handles truncation if length
3480 * exceeds msg_controllen.
3481 */
3482 control = kmem_zalloc(controllen, KM_SLEEP);
3483
3484 error = so_opt2cmsg(mp, opt, optlen,
3485 !(flags & MSG_XPG4_2),
3486 control, controllen);
3487 if (error) {
3488 freemsg(mp);
3489 kmem_free(control, controllen);
3490 eprintsoline(so, error);
3491 goto out;
3492 }
3493 msg->msg_control = control;
3494 msg->msg_controllen = controllen;
3495 }
3496
3497 /*
3498 * Set msg_flags to MSG_EOR based on
3499 * DATA_flag and MOREDATA.
3500 */
3501 mutex_enter(&so->so_lock);
3502 so->so_state &= ~SS_SAVEDEOR;
3503 if (!(tpr->data_ind.MORE_flag & 1)) {
3504 if (!(rval.r_val1 & MOREDATA))
3505 msg->msg_flags |= MSG_EOR;
3506 else
3507 so->so_state |= SS_SAVEDEOR;
3508 }
3509 freemsg(mp);
3510 /*
3511 * If some data was received (i.e. not EOF) and the
3512 * read/recv* has not been satisfied wait for some more.
3513 * Not possible to wait if control info was received.
3514 */
3515 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3516 controllen == 0 &&
3517 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3518 mutex_exit(&so->so_lock);
3519 pflag = opflag | MSG_NOMARK;
3520 goto retry;
3521 }
3522 goto out_locked;
3523 }
3524 case T_EXDATA_IND: {
3525 dprintso(so, 1,
3526 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3527 "state %s\n",
3528 sti->sti_oobsigcnt, sti->sti_oobcnt,
3529 saved_resid - uiop->uio_resid,
3530 pr_state(so->so_state, so->so_mode)));
3531 /*
3532 * kstrgetmsg handles MSGMARK so there is nothing to
3533 * inspect in the T_EXDATA_IND.
3534 * strsock_proto makes the stream head queue the T_EXDATA_IND
3535 * as a separate message with no M_DATA component. Furthermore,
3536 * the stream head does not consolidate M_DATA messages onto
3537 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3538 * remains a message by itself. This is needed since MSGMARK
3539 * marks both the whole message as well as the last byte
3540 * of the message.
3541 */
3542 freemsg(mp);
3543 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3544 if (flags & MSG_PEEK) {
3545 /*
3546 * Even though we are peeking we consume the
3547 * T_EXDATA_IND thereby moving the mark information
3548 * to SS_RCVATMARK. Then the oob code below will
3549 * retry the peeking kstrgetmsg.
3550 * Note that the stream head read queue is
3551 * never flushed without holding SOREADLOCKED
3552 * thus the T_EXDATA_IND can not disappear
3553 * underneath us.
3554 */
3555 dprintso(so, 1,
3556 ("sotpi_recvmsg: consume EXDATA_IND "
3557 "counts %d/%d state %s\n",
3558 sti->sti_oobsigcnt,
3559 sti->sti_oobcnt,
3560 pr_state(so->so_state, so->so_mode)));
3561
3562 pflag = MSG_ANY | MSG_DELAYERROR;
3563 if (so->so_mode & SM_ATOMIC)
3564 pflag |= MSG_DISCARDTAIL;
3565
3566 pri = 0;
3567 mp = NULL;
3568
3569 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3570 &pri, &pflag, (clock_t)-1, &rval);
3571 ASSERT(uiop->uio_resid == saved_resid);
3572
3573 if (error) {
3574 #ifdef SOCK_DEBUG
3575 if (error != EWOULDBLOCK && error != EINTR) {
3576 eprintsoline(so, error);
3577 }
3578 #endif /* SOCK_DEBUG */
3579 goto out;
3580 }
3581 ASSERT(mp);
3582 tpr = (union T_primitives *)mp->b_rptr;
3583 ASSERT(tpr->type == T_EXDATA_IND);
3584 freemsg(mp);
3585 } /* end "if (flags & MSG_PEEK)" */
3586
3587 /*
3588 * Decrement the number of queued and pending oob.
3589 *
3590 * SS_RCVATMARK is cleared when we read past a mark.
3591 * SS_HAVEOOBDATA is cleared when we've read past the
3592 * last mark.
3593 * SS_OOBPEND is cleared if we've read past the last
3594 * mark and no (new) SIGURG has been posted.
3595 */
3596 mutex_enter(&so->so_lock);
3597 ASSERT(so_verify_oobstate(so));
3598 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3599 ASSERT(sti->sti_oobsigcnt > 0);
3600 sti->sti_oobsigcnt--;
3601 ASSERT(sti->sti_oobcnt > 0);
3602 sti->sti_oobcnt--;
3603 /*
3604 * Since the T_EXDATA_IND has been removed from the stream
3605 * head, but we have not read data past the mark,
3606 * sockfs needs to track that the socket is still at the mark.
3607 *
3608 * Since no data was received call kstrgetmsg again to wait
3609 * for data.
3610 */
3611 so->so_state |= SS_RCVATMARK;
3612 mutex_exit(&so->so_lock);
3613 dprintso(so, 1,
3614 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3615 sti->sti_oobsigcnt, sti->sti_oobcnt,
3616 pr_state(so->so_state, so->so_mode)));
3617 pflag = opflag;
3618 goto retry;
3619 }
3620 default:
3621 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3622 (void *)so, tpr->type, (void *)mp);
3623 ASSERT(0);
3624 freemsg(mp);
3625 error = EPROTO;
3626 eprintsoline(so, error);
3627 goto out;
3628 }
3629 /* NOTREACHED */
3630 out:
3631 mutex_enter(&so->so_lock);
3632 out_locked:
3633 so_unlock_read(so); /* Clear SOREADLOCKED */
3634 mutex_exit(&so->so_lock);
3635 return (error);
3636 }
3637
3638 /*
3639 * Sending data with options on a datagram socket.
3640 * Assumes caller has verified that SS_ISBOUND etc. are set.
3641 *
3642 * For AF_UNIX the destination address may be already in
3643 * internal form, as indicated by sti->sti_faddr_noxlate
3644 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3645 * translate the destination address to internal form.
3646 *
3647 * The source address is passed as an option. If passing
3648 * file descriptors, those are passed as file pointers in
3649 * another option.
3650 */
3651 static int
sosend_dgramcmsg(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,void * control,t_uscalar_t controllen,int flags)3652 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3653 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3654 {
3655 struct T_unitdata_req tudr;
3656 mblk_t *mp;
3657 int error;
3658 void *addr;
3659 socklen_t addrlen;
3660 void *src;
3661 socklen_t srclen;
3662 ssize_t len;
3663 int size;
3664 struct T_opthdr toh;
3665 struct fdbuf *fdbuf;
3666 t_uscalar_t optlen;
3667 void *fds;
3668 int fdlen;
3669 sotpi_info_t *sti = SOTOTPI(so);
3670
3671 ASSERT(name && namelen);
3672 ASSERT(control && controllen);
3673
3674 len = uiop->uio_resid;
3675 if (len > (ssize_t)sti->sti_tidu_size) {
3676 return (EMSGSIZE);
3677 }
3678
3679 if (sti->sti_faddr_noxlate == 0 &&
3680 (flags & MSG_SENDTO_NOXLATE) == 0) {
3681 /*
3682 * Length and family checks.
3683 * Don't verify internal form.
3684 */
3685 error = so_addr_verify(so, name, namelen);
3686 if (error) {
3687 eprintsoline(so, error);
3688 return (error);
3689 }
3690 }
3691
3692 if (so->so_family == AF_UNIX) {
3693 if (sti->sti_faddr_noxlate) {
3694 /*
3695 * Already have a transport internal address. Do not
3696 * pass any (transport internal) source address.
3697 */
3698 addr = name;
3699 addrlen = namelen;
3700 src = NULL;
3701 srclen = 0;
3702 } else if (flags & MSG_SENDTO_NOXLATE) {
3703 /*
3704 * Have an internal form dest. address.
3705 * Pass the source address as usual.
3706 */
3707 addr = name;
3708 addrlen = namelen;
3709 src = sti->sti_laddr_sa;
3710 srclen = (socklen_t)sti->sti_laddr_len;
3711 } else {
3712 /*
3713 * Pass the sockaddr_un source address as an option
3714 * and translate the remote address.
3715 *
3716 * Note that this code does not prevent sti_laddr_sa
3717 * from changing while it is being used. Thus
3718 * if an unbind+bind occurs concurrently with this
3719 * send the peer might see a partially new and a
3720 * partially old "from" address.
3721 */
3722 src = sti->sti_laddr_sa;
3723 srclen = (socklen_t)sti->sti_laddr_len;
3724 dprintso(so, 1,
3725 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3726 srclen, src));
3727 /*
3728 * The sendmsg caller specified a destination
3729 * address, which we must translate into our
3730 * internal form. addr = &sti->sti_ux_taddr
3731 */
3732 error = so_ux_addr_xlate(so, name, namelen,
3733 (flags & MSG_XPG4_2),
3734 &addr, &addrlen);
3735 if (error) {
3736 eprintsoline(so, error);
3737 return (error);
3738 }
3739 }
3740 } else {
3741 addr = name;
3742 addrlen = namelen;
3743 src = NULL;
3744 srclen = 0;
3745 }
3746 optlen = so_optlen(control, controllen,
3747 !(flags & MSG_XPG4_2));
3748 tudr.PRIM_type = T_UNITDATA_REQ;
3749 tudr.DEST_length = addrlen;
3750 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3751 if (srclen != 0)
3752 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3753 _TPI_ALIGN_TOPT(srclen));
3754 else
3755 tudr.OPT_length = optlen;
3756 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3757 _TPI_ALIGN_TOPT(addrlen));
3758
3759 size = tudr.OPT_offset + tudr.OPT_length;
3760
3761 /*
3762 * File descriptors only when SM_FDPASSING set.
3763 */
3764 error = so_getfdopt(control, controllen,
3765 !(flags & MSG_XPG4_2), &fds, &fdlen);
3766 if (error)
3767 return (error);
3768 if (fdlen != -1) {
3769 if (!(so->so_mode & SM_FDPASSING))
3770 return (EOPNOTSUPP);
3771
3772 error = fdbuf_create(fds, fdlen, &fdbuf);
3773 if (error)
3774 return (error);
3775 mp = fdbuf_allocmsg(size, fdbuf);
3776 } else {
3777 mp = soallocproto(size, _ALLOC_INTR, CRED());
3778 if (mp == NULL) {
3779 /*
3780 * Caught a signal waiting for memory.
3781 * Let send* return EINTR.
3782 */
3783 return (EINTR);
3784 }
3785 }
3786 soappendmsg(mp, &tudr, sizeof (tudr));
3787 soappendmsg(mp, addr, addrlen);
3788 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3789
3790 if (fdlen != -1) {
3791 ASSERT(fdbuf != NULL);
3792 toh.level = SOL_SOCKET;
3793 toh.name = SO_FILEP;
3794 toh.len = fdbuf->fd_size +
3795 (t_uscalar_t)sizeof (struct T_opthdr);
3796 toh.status = 0;
3797 soappendmsg(mp, &toh, sizeof (toh));
3798 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3799 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3800 }
3801 if (srclen != 0) {
3802 /*
3803 * There is a AF_UNIX sockaddr_un to include as a source
3804 * address option.
3805 */
3806 toh.level = SOL_SOCKET;
3807 toh.name = SO_SRCADDR;
3808 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3809 toh.status = 0;
3810 soappendmsg(mp, &toh, sizeof (toh));
3811 soappendmsg(mp, src, srclen);
3812 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3813 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3814 }
3815 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3816 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3817 /* At most 3 bytes left in the message */
3818 ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3819 ASSERT(MBLKL(mp) <= (ssize_t)size);
3820
3821 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3822 if (AU_AUDITING())
3823 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3824
3825 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3826 #ifdef SOCK_DEBUG
3827 if (error) {
3828 eprintsoline(so, error);
3829 }
3830 #endif /* SOCK_DEBUG */
3831 return (error);
3832 }
3833
3834 /*
3835 * Sending data with options on a connected stream socket.
3836 * Assumes caller has verified that SS_ISCONNECTED is set.
3837 */
3838 static int
sosend_svccmsg(struct sonode * so,struct uio * uiop,int more,void * control,t_uscalar_t controllen,int flags)3839 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3840 t_uscalar_t controllen, int flags)
3841 {
3842 struct T_optdata_req tdr;
3843 mblk_t *mp;
3844 int error;
3845 ssize_t iosize;
3846 int size;
3847 struct fdbuf *fdbuf;
3848 t_uscalar_t optlen;
3849 void *fds;
3850 int fdlen;
3851 struct T_opthdr toh;
3852 sotpi_info_t *sti = SOTOTPI(so);
3853
3854 dprintso(so, 1,
3855 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3856
3857 /*
3858 * Has to be bound and connected. However, since no locks are
3859 * held the state could have changed after sotpi_sendmsg checked it
3860 * thus it is not possible to ASSERT on the state.
3861 */
3862
3863 /* Options on connection-oriented only when SM_OPTDATA set. */
3864 if (!(so->so_mode & SM_OPTDATA))
3865 return (EOPNOTSUPP);
3866
3867 do {
3868 /*
3869 * Set the MORE flag if uio_resid does not fit in this
3870 * message or if the caller passed in "more".
3871 * Error for transports with zero tidu_size.
3872 */
3873 tdr.PRIM_type = T_OPTDATA_REQ;
3874 iosize = sti->sti_tidu_size;
3875 if (iosize <= 0)
3876 return (EMSGSIZE);
3877 if (uiop->uio_resid > iosize) {
3878 tdr.DATA_flag = 1;
3879 } else {
3880 if (more)
3881 tdr.DATA_flag = 1;
3882 else
3883 tdr.DATA_flag = 0;
3884 iosize = uiop->uio_resid;
3885 }
3886 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3887 tdr.DATA_flag, iosize));
3888
3889 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3890 tdr.OPT_length = optlen;
3891 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3892
3893 size = (int)sizeof (tdr) + optlen;
3894 /*
3895 * File descriptors only when SM_FDPASSING set.
3896 */
3897 error = so_getfdopt(control, controllen,
3898 !(flags & MSG_XPG4_2), &fds, &fdlen);
3899 if (error)
3900 return (error);
3901 if (fdlen != -1) {
3902 if (!(so->so_mode & SM_FDPASSING))
3903 return (EOPNOTSUPP);
3904
3905 error = fdbuf_create(fds, fdlen, &fdbuf);
3906 if (error)
3907 return (error);
3908 mp = fdbuf_allocmsg(size, fdbuf);
3909 } else {
3910 mp = soallocproto(size, _ALLOC_INTR, CRED());
3911 if (mp == NULL) {
3912 /*
3913 * Caught a signal waiting for memory.
3914 * Let send* return EINTR.
3915 */
3916 return (EINTR);
3917 }
3918 }
3919 soappendmsg(mp, &tdr, sizeof (tdr));
3920
3921 if (fdlen != -1) {
3922 ASSERT(fdbuf != NULL);
3923 toh.level = SOL_SOCKET;
3924 toh.name = SO_FILEP;
3925 toh.len = fdbuf->fd_size +
3926 (t_uscalar_t)sizeof (struct T_opthdr);
3927 toh.status = 0;
3928 soappendmsg(mp, &toh, sizeof (toh));
3929 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3930 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3931 }
3932 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3933 /* At most 3 bytes left in the message */
3934 ASSERT(MBLKL(mp) > (ssize_t)(size - __TPI_ALIGN_SIZE));
3935 ASSERT(MBLKL(mp) <= (ssize_t)size);
3936
3937 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3938
3939 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3940 0, MSG_BAND, 0);
3941 if (error) {
3942 eprintsoline(so, error);
3943 return (error);
3944 }
3945 control = NULL;
3946 if (uiop->uio_resid > 0) {
3947 /*
3948 * Recheck for fatal errors. Fail write even though
3949 * some data have been written. This is consistent
3950 * with strwrite semantics and BSD sockets semantics.
3951 */
3952 if (so->so_state & SS_CANTSENDMORE) {
3953 eprintsoline(so, error);
3954 return (EPIPE);
3955 }
3956 if (so->so_error != 0) {
3957 mutex_enter(&so->so_lock);
3958 error = sogeterr(so, B_TRUE);
3959 mutex_exit(&so->so_lock);
3960 if (error != 0) {
3961 eprintsoline(so, error);
3962 return (error);
3963 }
3964 }
3965 }
3966 } while (uiop->uio_resid > 0);
3967 return (0);
3968 }
3969
3970 /*
3971 * Sending data on a datagram socket.
3972 * Assumes caller has verified that SS_ISBOUND etc. are set.
3973 *
3974 * For AF_UNIX the destination address may be already in
3975 * internal form, as indicated by sti->sti_faddr_noxlate
3976 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3977 * translate the destination address to internal form.
3978 *
3979 * The source address is passed as an option.
3980 */
3981 int
sosend_dgram(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)3982 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3983 struct uio *uiop, int flags)
3984 {
3985 struct T_unitdata_req tudr;
3986 mblk_t *mp;
3987 int error;
3988 void *addr;
3989 socklen_t addrlen;
3990 void *src;
3991 socklen_t srclen;
3992 ssize_t len;
3993 sotpi_info_t *sti = SOTOTPI(so);
3994
3995 ASSERT(name != NULL && namelen != 0);
3996
3997 len = uiop->uio_resid;
3998 if (len > sti->sti_tidu_size) {
3999 error = EMSGSIZE;
4000 goto done;
4001 }
4002
4003 if (sti->sti_faddr_noxlate == 0 &&
4004 (flags & MSG_SENDTO_NOXLATE) == 0) {
4005 /*
4006 * Length and family checks.
4007 * Don't verify internal form.
4008 */
4009 error = so_addr_verify(so, name, namelen);
4010 if (error != 0)
4011 goto done;
4012 }
4013
4014 if (sti->sti_direct) /* Never on AF_UNIX */
4015 return (sodgram_direct(so, name, namelen, uiop, flags));
4016
4017 if (so->so_family == AF_UNIX) {
4018 if (sti->sti_faddr_noxlate) {
4019 /*
4020 * Already have a transport internal address. Do not
4021 * pass any (transport internal) source address.
4022 */
4023 addr = name;
4024 addrlen = namelen;
4025 src = NULL;
4026 srclen = 0;
4027 } else if (flags & MSG_SENDTO_NOXLATE) {
4028 /*
4029 * Have an internal form dest. address.
4030 * Pass the source address as usual.
4031 */
4032 addr = name;
4033 addrlen = namelen;
4034 src = sti->sti_laddr_sa;
4035 srclen = (socklen_t)sti->sti_laddr_len;
4036 } else {
4037 /*
4038 * Pass the sockaddr_un source address as an option
4039 * and translate the remote address.
4040 *
4041 * Note that this code does not prevent sti_laddr_sa
4042 * from changing while it is being used. Thus
4043 * if an unbind+bind occurs concurrently with this
4044 * send the peer might see a partially new and a
4045 * partially old "from" address.
4046 */
4047 src = sti->sti_laddr_sa;
4048 srclen = (socklen_t)sti->sti_laddr_len;
4049 dprintso(so, 1,
4050 ("sosend_dgram UNIX: srclen %d, src %p\n",
4051 srclen, src));
4052 /*
4053 * The sendmsg caller specified a destination
4054 * address, which we must translate into our
4055 * internal form. addr = &sti->sti_ux_taddr
4056 */
4057 error = so_ux_addr_xlate(so, name, namelen,
4058 (flags & MSG_XPG4_2),
4059 &addr, &addrlen);
4060 if (error) {
4061 eprintsoline(so, error);
4062 goto done;
4063 }
4064 }
4065 } else {
4066 addr = name;
4067 addrlen = namelen;
4068 src = NULL;
4069 srclen = 0;
4070 }
4071 tudr.PRIM_type = T_UNITDATA_REQ;
4072 tudr.DEST_length = addrlen;
4073 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4074 if (srclen == 0) {
4075 tudr.OPT_length = 0;
4076 tudr.OPT_offset = 0;
4077
4078 mp = soallocproto2(&tudr, sizeof (tudr),
4079 addr, addrlen, 0, _ALLOC_INTR, CRED());
4080 if (mp == NULL) {
4081 /*
4082 * Caught a signal waiting for memory.
4083 * Let send* return EINTR.
4084 */
4085 error = EINTR;
4086 goto done;
4087 }
4088 } else {
4089 /*
4090 * There is a AF_UNIX sockaddr_un to include as a source
4091 * address option.
4092 */
4093 struct T_opthdr toh;
4094 ssize_t size;
4095
4096 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
4097 _TPI_ALIGN_TOPT(srclen));
4098 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
4099 _TPI_ALIGN_TOPT(addrlen));
4100
4101 toh.level = SOL_SOCKET;
4102 toh.name = SO_SRCADDR;
4103 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
4104 toh.status = 0;
4105
4106 size = tudr.OPT_offset + tudr.OPT_length;
4107 mp = soallocproto2(&tudr, sizeof (tudr),
4108 addr, addrlen, size, _ALLOC_INTR, CRED());
4109 if (mp == NULL) {
4110 /*
4111 * Caught a signal waiting for memory.
4112 * Let send* return EINTR.
4113 */
4114 error = EINTR;
4115 goto done;
4116 }
4117 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
4118 soappendmsg(mp, &toh, sizeof (toh));
4119 soappendmsg(mp, src, srclen);
4120 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
4121 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
4122 }
4123
4124 if (AU_AUDITING())
4125 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4126
4127 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4128 done:
4129 #ifdef SOCK_DEBUG
4130 if (error) {
4131 eprintsoline(so, error);
4132 }
4133 #endif /* SOCK_DEBUG */
4134 return (error);
4135 }
4136
4137 /*
4138 * Sending data on a connected stream socket.
4139 * Assumes caller has verified that SS_ISCONNECTED is set.
4140 */
4141 int
sosend_svc(struct sonode * so,struct uio * uiop,t_scalar_t prim,int more,int sflag)4142 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
4143 int sflag)
4144 {
4145 struct T_data_req tdr;
4146 mblk_t *mp;
4147 int error;
4148 ssize_t iosize;
4149 sotpi_info_t *sti = SOTOTPI(so);
4150
4151 dprintso(so, 1,
4152 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
4153 (void *)so, uiop->uio_resid, prim, sflag));
4154
4155 /*
4156 * Has to be bound and connected. However, since no locks are
4157 * held the state could have changed after sotpi_sendmsg checked it
4158 * thus it is not possible to ASSERT on the state.
4159 */
4160
4161 do {
4162 /*
4163 * Set the MORE flag if uio_resid does not fit in this
4164 * message or if the caller passed in "more".
4165 * Error for transports with zero tidu_size.
4166 */
4167 tdr.PRIM_type = prim;
4168 iosize = sti->sti_tidu_size;
4169 if (iosize <= 0)
4170 return (EMSGSIZE);
4171 if (uiop->uio_resid > iosize) {
4172 tdr.MORE_flag = 1;
4173 } else {
4174 if (more)
4175 tdr.MORE_flag = 1;
4176 else
4177 tdr.MORE_flag = 0;
4178 iosize = uiop->uio_resid;
4179 }
4180 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4181 prim, tdr.MORE_flag, iosize));
4182 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4183 if (mp == NULL) {
4184 /*
4185 * Caught a signal waiting for memory.
4186 * Let send* return EINTR.
4187 */
4188 return (EINTR);
4189 }
4190
4191 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4192 0, sflag | MSG_BAND, 0);
4193 if (error) {
4194 eprintsoline(so, error);
4195 return (error);
4196 }
4197 if (uiop->uio_resid > 0) {
4198 /*
4199 * Recheck for fatal errors. Fail write even though
4200 * some data have been written. This is consistent
4201 * with strwrite semantics and BSD sockets semantics.
4202 */
4203 if (so->so_state & SS_CANTSENDMORE) {
4204 eprintsoline(so, error);
4205 return (EPIPE);
4206 }
4207 if (so->so_error != 0) {
4208 mutex_enter(&so->so_lock);
4209 error = sogeterr(so, B_TRUE);
4210 mutex_exit(&so->so_lock);
4211 if (error != 0) {
4212 eprintsoline(so, error);
4213 return (error);
4214 }
4215 }
4216 }
4217 } while (uiop->uio_resid > 0);
4218 return (0);
4219 }
4220
4221 /*
4222 * Check the state for errors and call the appropriate send function.
4223 *
4224 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4225 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4226 * after sending the message.
4227 *
4228 * The caller may optionally specify a destination address, for either
4229 * stream or datagram sockets. This table summarizes the cases:
4230 *
4231 * Socket type Dest. given Connected Result
4232 * ----------- ----------- --------- --------------
4233 * Stream * Yes send to conn. addr.
4234 * Stream * No error ENOTCONN
4235 * Dgram yes * send to given addr.
4236 * Dgram no yes send to conn. addr.
4237 * Dgram no no error EDESTADDRREQ
4238 *
4239 * There are subtleties around the destination address when using
4240 * AF_UNIX datagram sockets. When the sendmsg call specifies the
4241 * destination address, it's in (struct sockaddr_un) form and we
4242 * need to translate it to our internal form (struct so_ux_addr).
4243 *
4244 * When the sendmsg call does not specify a destination address
4245 * we're using the peer address saved during sotpi_connect, and
4246 * that address is already in internal form. In this case, the
4247 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4248 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4249 * those functions should skip translation to internal form.
4250 * Avoiding that translation is not only more efficient, but it's
4251 * also necessary when a process does a connect on an AF_UNIX
4252 * datagram socket and then drops privileges. After the process
4253 * has dropped privileges, it may no longer be able to lookup the
4254 * the external name in the filesystem, but it should still be
4255 * able to send messages on the connected socket by leaving the
4256 * destination name unspecified.
4257 *
4258 * Yet more subtleties arise with sockets connected by socketpair(),
4259 * which puts internal form addresses in the fields where normally
4260 * the external form is found, and sets sti_faddr_noxlate=1, which
4261 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4262 * to skip translation of destination addresses to internal form.
4263 * However, beware that the flag sti_faddr_noxlate=1 also triggers
4264 * different behaviour almost everywhere AF_UNIX addresses appear.
4265 */
4266 static int
sotpi_sendmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)4267 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4268 struct cred *cr)
4269 {
4270 int so_state;
4271 int so_mode;
4272 int error;
4273 struct sockaddr *name;
4274 t_uscalar_t namelen;
4275 int dontroute;
4276 int flags;
4277 sotpi_info_t *sti = SOTOTPI(so);
4278
4279 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4280 (void *)so, (void *)msg, msg->msg_flags,
4281 pr_state(so->so_state, so->so_mode), so->so_error));
4282
4283 if (so->so_version == SOV_STREAM) {
4284 /* The imaginary "sockmod" has been popped - act as a stream */
4285 so_update_attrs(so, SOMOD);
4286 return (strwrite(SOTOV(so), uiop, cr));
4287 }
4288
4289 mutex_enter(&so->so_lock);
4290 so_state = so->so_state;
4291
4292 if (so_state & SS_CANTSENDMORE) {
4293 mutex_exit(&so->so_lock);
4294 return (EPIPE);
4295 }
4296
4297 if (so->so_error != 0) {
4298 error = sogeterr(so, B_TRUE);
4299 if (error != 0) {
4300 mutex_exit(&so->so_lock);
4301 return (error);
4302 }
4303 }
4304
4305 name = (struct sockaddr *)msg->msg_name;
4306 namelen = msg->msg_namelen;
4307 flags = msg->msg_flags;
4308
4309 /*
4310 * Historically, this function does not validate the flags
4311 * passed in, and any errant bits are ignored. However,
4312 * we would not want any such errant flag bits accidently
4313 * being treated as one of the internal-only flags, so
4314 * clear the internal-only flag bits.
4315 */
4316 flags &= ~MSG_SENDTO_NOXLATE;
4317
4318 so_mode = so->so_mode;
4319
4320 if (name == NULL) {
4321 if (!(so_state & SS_ISCONNECTED)) {
4322 mutex_exit(&so->so_lock);
4323 if (so_mode & SM_CONNREQUIRED)
4324 return (ENOTCONN);
4325 else
4326 return (EDESTADDRREQ);
4327 }
4328 /*
4329 * This is a connected socket.
4330 */
4331 if (so_mode & SM_CONNREQUIRED) {
4332 /*
4333 * This is a connected STREAM socket,
4334 * destination not specified.
4335 */
4336 name = NULL;
4337 namelen = 0;
4338 } else {
4339 /*
4340 * Datagram send on connected socket with
4341 * the destination name not specified.
4342 * Use the peer address from connect.
4343 */
4344 if (so->so_family == AF_UNIX) {
4345 /*
4346 * Use the (internal form) address saved
4347 * in sotpi_connect. See above.
4348 */
4349 name = (void *)&sti->sti_ux_faddr;
4350 namelen = sizeof (sti->sti_ux_faddr);
4351 flags |= MSG_SENDTO_NOXLATE;
4352 } else {
4353 ASSERT(sti->sti_faddr_sa);
4354 name = sti->sti_faddr_sa;
4355 namelen = (t_uscalar_t)sti->sti_faddr_len;
4356 }
4357 }
4358 } else {
4359 /*
4360 * Sendmsg specifies a destination name
4361 */
4362 if (!(so_state & SS_ISCONNECTED) &&
4363 (so_mode & SM_CONNREQUIRED)) {
4364 /* i.e. TCP not connected */
4365 mutex_exit(&so->so_lock);
4366 return (ENOTCONN);
4367 }
4368 /*
4369 * Ignore the address on connection-oriented sockets.
4370 * Just like BSD this code does not generate an error for
4371 * TCP (a CONNREQUIRED socket) when sending to an address
4372 * passed in with sendto/sendmsg. Instead the data is
4373 * delivered on the connection as if no address had been
4374 * supplied.
4375 */
4376 if ((so_state & SS_ISCONNECTED) &&
4377 !(so_mode & SM_CONNREQUIRED)) {
4378 mutex_exit(&so->so_lock);
4379 return (EISCONN);
4380 }
4381 if (!(so_state & SS_ISBOUND)) {
4382 so_lock_single(so); /* Set SOLOCKED */
4383 error = sotpi_bind(so, NULL, 0,
4384 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4385 so_unlock_single(so, SOLOCKED);
4386 if (error) {
4387 mutex_exit(&so->so_lock);
4388 eprintsoline(so, error);
4389 return (error);
4390 }
4391 }
4392 /*
4393 * Handle delayed datagram errors. These are only queued
4394 * when the application sets SO_DGRAM_ERRIND.
4395 * Return the error if we are sending to the address
4396 * that was returned in the last T_UDERROR_IND.
4397 * If sending to some other address discard the delayed
4398 * error indication.
4399 */
4400 if (sti->sti_delayed_error) {
4401 struct T_uderror_ind *tudi;
4402 void *addr;
4403 t_uscalar_t addrlen;
4404 boolean_t match = B_FALSE;
4405
4406 ASSERT(sti->sti_eaddr_mp);
4407 error = sti->sti_delayed_error;
4408 sti->sti_delayed_error = 0;
4409 tudi =
4410 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4411 addrlen = tudi->DEST_length;
4412 addr = sogetoff(sti->sti_eaddr_mp,
4413 tudi->DEST_offset, addrlen, 1);
4414 ASSERT(addr); /* Checked by strsock_proto */
4415 switch (so->so_family) {
4416 case AF_INET: {
4417 /* Compare just IP address and port */
4418 sin_t *sin1 = (sin_t *)name;
4419 sin_t *sin2 = (sin_t *)addr;
4420
4421 if (addrlen == sizeof (sin_t) &&
4422 namelen == addrlen &&
4423 sin1->sin_port == sin2->sin_port &&
4424 sin1->sin_addr.s_addr ==
4425 sin2->sin_addr.s_addr)
4426 match = B_TRUE;
4427 break;
4428 }
4429 case AF_INET6: {
4430 /* Compare just IP address and port. Not flow */
4431 sin6_t *sin1 = (sin6_t *)name;
4432 sin6_t *sin2 = (sin6_t *)addr;
4433
4434 if (addrlen == sizeof (sin6_t) &&
4435 namelen == addrlen &&
4436 sin1->sin6_port == sin2->sin6_port &&
4437 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4438 &sin2->sin6_addr))
4439 match = B_TRUE;
4440 break;
4441 }
4442 case AF_UNIX:
4443 default:
4444 if (namelen == addrlen &&
4445 bcmp(name, addr, namelen) == 0)
4446 match = B_TRUE;
4447 }
4448 if (match) {
4449 freemsg(sti->sti_eaddr_mp);
4450 sti->sti_eaddr_mp = NULL;
4451 mutex_exit(&so->so_lock);
4452 #ifdef DEBUG
4453 dprintso(so, 0,
4454 ("sockfs delayed error %d for %s\n",
4455 error,
4456 pr_addr(so->so_family, name, namelen)));
4457 #endif /* DEBUG */
4458 return (error);
4459 }
4460 freemsg(sti->sti_eaddr_mp);
4461 sti->sti_eaddr_mp = NULL;
4462 }
4463 }
4464 mutex_exit(&so->so_lock);
4465
4466 dontroute = 0;
4467 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4468 uint32_t val;
4469
4470 val = 1;
4471 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4472 &val, (t_uscalar_t)sizeof (val), cr);
4473 if (error)
4474 return (error);
4475 dontroute = 1;
4476 }
4477
4478 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4479 error = EOPNOTSUPP;
4480 goto done;
4481 }
4482 if (msg->msg_controllen != 0) {
4483 if (!(so_mode & SM_CONNREQUIRED)) {
4484 so_update_attrs(so, SOMOD);
4485 error = sosend_dgramcmsg(so, name, namelen, uiop,
4486 msg->msg_control, msg->msg_controllen, flags);
4487 } else {
4488 if (flags & MSG_OOB) {
4489 /* Can't generate T_EXDATA_REQ with options */
4490 error = EOPNOTSUPP;
4491 goto done;
4492 }
4493 so_update_attrs(so, SOMOD);
4494 error = sosend_svccmsg(so, uiop,
4495 !(flags & MSG_EOR),
4496 msg->msg_control, msg->msg_controllen,
4497 flags);
4498 }
4499 goto done;
4500 }
4501
4502 so_update_attrs(so, SOMOD);
4503 if (!(so_mode & SM_CONNREQUIRED)) {
4504 /*
4505 * If there is no SO_DONTROUTE to turn off return immediately
4506 * from send_dgram. This can allow tail-call optimizations.
4507 */
4508 if (!dontroute) {
4509 return (sosend_dgram(so, name, namelen, uiop, flags));
4510 }
4511 error = sosend_dgram(so, name, namelen, uiop, flags);
4512 } else {
4513 t_scalar_t prim;
4514 int sflag;
4515
4516 /* Ignore msg_name in the connected state */
4517 if (flags & MSG_OOB) {
4518 prim = T_EXDATA_REQ;
4519 /*
4520 * Send down T_EXDATA_REQ even if there is flow
4521 * control for data.
4522 */
4523 sflag = MSG_IGNFLOW;
4524 } else {
4525 if (so_mode & SM_BYTESTREAM) {
4526 /* Byte stream transport - use write */
4527 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4528
4529 /* Send M_DATA messages */
4530 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
4531 (error = nl7c_data(so, uiop)) >= 0) {
4532 /* NL7C consumed the data */
4533 return (error);
4534 }
4535 /*
4536 * If there is no SO_DONTROUTE to turn off,
4537 * sti_direct is on, and there is no flow
4538 * control, we can take the fast path.
4539 */
4540 if (!dontroute && sti->sti_direct != 0 &&
4541 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4542 return (sostream_direct(so, uiop,
4543 NULL, cr));
4544 }
4545 error = strwrite(SOTOV(so), uiop, cr);
4546 goto done;
4547 }
4548 prim = T_DATA_REQ;
4549 sflag = 0;
4550 }
4551 /*
4552 * If there is no SO_DONTROUTE to turn off return immediately
4553 * from sosend_svc. This can allow tail-call optimizations.
4554 */
4555 if (!dontroute)
4556 return (sosend_svc(so, uiop, prim,
4557 !(flags & MSG_EOR), sflag));
4558 error = sosend_svc(so, uiop, prim,
4559 !(flags & MSG_EOR), sflag);
4560 }
4561 ASSERT(dontroute);
4562 done:
4563 if (dontroute) {
4564 uint32_t val;
4565
4566 val = 0;
4567 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4568 &val, (t_uscalar_t)sizeof (val), cr);
4569 }
4570 return (error);
4571 }
4572
4573 /*
4574 * kstrwritemp() has very similar semantics as that of strwrite().
4575 * The main difference is it obtains mblks from the caller and also
4576 * does not do any copy as done in strwrite() from user buffers to
4577 * kernel buffers.
4578 *
4579 * Currently, this routine is used by sendfile to send data allocated
4580 * within the kernel without any copying. This interface does not use the
4581 * synchronous stream interface as synch. stream interface implies
4582 * copying.
4583 */
4584 int
kstrwritemp(struct vnode * vp,mblk_t * mp,ushort_t fmode)4585 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4586 {
4587 struct stdata *stp;
4588 struct queue *wqp;
4589 mblk_t *newmp;
4590 char waitflag;
4591 int tempmode;
4592 int error = 0;
4593 int done = 0;
4594 struct sonode *so;
4595 boolean_t direct;
4596
4597 ASSERT(vp->v_stream);
4598 stp = vp->v_stream;
4599
4600 so = VTOSO(vp);
4601 direct = _SOTOTPI(so)->sti_direct;
4602
4603 /*
4604 * This is the sockfs direct fast path. canputnext() need
4605 * not be accurate so we don't grab the sd_lock here. If
4606 * we get flow-controlled, we grab sd_lock just before the
4607 * do..while loop below to emulate what strwrite() does.
4608 */
4609 wqp = stp->sd_wrq;
4610 if (canputnext(wqp) && direct &&
4611 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4612 return (sostream_direct(so, NULL, mp, CRED()));
4613 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4614 /* Fast check of flags before acquiring the lock */
4615 mutex_enter(&stp->sd_lock);
4616 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4617 mutex_exit(&stp->sd_lock);
4618 if (error != 0) {
4619 if (!(stp->sd_flag & STPLEX) &&
4620 (stp->sd_wput_opt & SW_SIGPIPE)) {
4621 error = EPIPE;
4622 }
4623 return (error);
4624 }
4625 }
4626
4627 waitflag = WRITEWAIT;
4628 if (stp->sd_flag & OLDNDELAY)
4629 tempmode = fmode & ~FNDELAY;
4630 else
4631 tempmode = fmode;
4632
4633 mutex_enter(&stp->sd_lock);
4634 do {
4635 if (canputnext(wqp)) {
4636 mutex_exit(&stp->sd_lock);
4637 if (stp->sd_wputdatafunc != NULL) {
4638 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4639 NULL, NULL, NULL);
4640 if (newmp == NULL) {
4641 /* The caller will free mp */
4642 return (ECOMM);
4643 }
4644 mp = newmp;
4645 }
4646 putnext(wqp, mp);
4647 return (0);
4648 }
4649 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4650 &done);
4651 } while (error == 0 && !done);
4652
4653 mutex_exit(&stp->sd_lock);
4654 /*
4655 * EAGAIN tells the application to try again. ENOMEM
4656 * is returned only if the memory allocation size
4657 * exceeds the physical limits of the system. ENOMEM
4658 * can't be true here.
4659 */
4660 if (error == ENOMEM)
4661 error = EAGAIN;
4662 return (error);
4663 }
4664
4665 /* ARGSUSED */
4666 static int
sotpi_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)4667 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4668 struct cred *cr, mblk_t **mpp)
4669 {
4670 int error;
4671
4672 if (so->so_family != AF_INET && so->so_family != AF_INET6)
4673 return (EAFNOSUPPORT);
4674
4675 if (so->so_state & SS_CANTSENDMORE)
4676 return (EPIPE);
4677
4678 if (so->so_type != SOCK_STREAM)
4679 return (EOPNOTSUPP);
4680
4681 if ((so->so_state & SS_ISCONNECTED) == 0)
4682 return (ENOTCONN);
4683
4684 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4685 if (error == 0)
4686 *mpp = NULL;
4687 return (error);
4688 }
4689
4690 /*
4691 * Sending data on a datagram socket.
4692 * Assumes caller has verified that SS_ISBOUND etc. are set.
4693 */
4694 /* ARGSUSED */
4695 static int
sodgram_direct(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)4696 sodgram_direct(struct sonode *so, struct sockaddr *name,
4697 socklen_t namelen, struct uio *uiop, int flags)
4698 {
4699 struct T_unitdata_req tudr;
4700 mblk_t *mp = NULL;
4701 int error = 0;
4702 void *addr;
4703 socklen_t addrlen;
4704 ssize_t len;
4705 struct stdata *stp = SOTOV(so)->v_stream;
4706 int so_state;
4707 queue_t *udp_wq;
4708 boolean_t connected;
4709 mblk_t *mpdata = NULL;
4710 sotpi_info_t *sti = SOTOTPI(so);
4711 uint32_t auditing = AU_AUDITING();
4712
4713 ASSERT(name != NULL && namelen != 0);
4714 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4715 ASSERT(!(so->so_mode & SM_EXDATA));
4716 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4717 ASSERT(SOTOV(so)->v_type == VSOCK);
4718
4719 /* Caller checked for proper length */
4720 len = uiop->uio_resid;
4721 ASSERT(len <= sti->sti_tidu_size);
4722
4723 /* Length and family checks have been done by caller */
4724 ASSERT(name->sa_family == so->so_family);
4725 ASSERT(so->so_family == AF_INET ||
4726 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4727 ASSERT(so->so_family == AF_INET6 ||
4728 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4729
4730 addr = name;
4731 addrlen = namelen;
4732
4733 if (stp->sd_sidp != NULL &&
4734 (error = straccess(stp, JCWRITE)) != 0)
4735 goto done;
4736
4737 so_state = so->so_state;
4738
4739 connected = so_state & SS_ISCONNECTED;
4740 if (!connected) {
4741 tudr.PRIM_type = T_UNITDATA_REQ;
4742 tudr.DEST_length = addrlen;
4743 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4744 tudr.OPT_length = 0;
4745 tudr.OPT_offset = 0;
4746
4747 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4748 _ALLOC_INTR, CRED());
4749 if (mp == NULL) {
4750 /*
4751 * Caught a signal waiting for memory.
4752 * Let send* return EINTR.
4753 */
4754 error = EINTR;
4755 goto done;
4756 }
4757 }
4758
4759 /*
4760 * For UDP we don't break up the copyin into smaller pieces
4761 * as in the TCP case. That means if ENOMEM is returned by
4762 * mcopyinuio() then the uio vector has not been modified at
4763 * all and we fallback to either strwrite() or kstrputmsg()
4764 * below. Note also that we never generate priority messages
4765 * from here.
4766 */
4767 udp_wq = stp->sd_wrq->q_next;
4768 if (canput(udp_wq) &&
4769 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4770 ASSERT(DB_TYPE(mpdata) == M_DATA);
4771 ASSERT(uiop->uio_resid == 0);
4772 if (!connected)
4773 linkb(mp, mpdata);
4774 else
4775 mp = mpdata;
4776 if (auditing)
4777 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4778
4779 udp_wput(udp_wq, mp);
4780 return (0);
4781 }
4782
4783 ASSERT(mpdata == NULL);
4784 if (error != 0 && error != ENOMEM) {
4785 freemsg(mp);
4786 return (error);
4787 }
4788
4789 /*
4790 * For connected, let strwrite() handle the blocking case.
4791 * Otherwise we fall thru and use kstrputmsg().
4792 */
4793 if (connected)
4794 return (strwrite(SOTOV(so), uiop, CRED()));
4795
4796 if (auditing)
4797 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4798
4799 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4800 done:
4801 #ifdef SOCK_DEBUG
4802 if (error != 0) {
4803 eprintsoline(so, error);
4804 }
4805 #endif /* SOCK_DEBUG */
4806 return (error);
4807 }
4808
4809 int
sostream_direct(struct sonode * so,struct uio * uiop,mblk_t * mp,cred_t * cr)4810 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4811 {
4812 struct stdata *stp = SOTOV(so)->v_stream;
4813 ssize_t iosize, rmax, maxblk;
4814 queue_t *tcp_wq = stp->sd_wrq->q_next;
4815 mblk_t *newmp;
4816 int error = 0, wflag = 0;
4817
4818 ASSERT(so->so_mode & SM_BYTESTREAM);
4819 ASSERT(SOTOV(so)->v_type == VSOCK);
4820
4821 if (stp->sd_sidp != NULL &&
4822 (error = straccess(stp, JCWRITE)) != 0)
4823 return (error);
4824
4825 if (uiop == NULL) {
4826 /*
4827 * kstrwritemp() should have checked sd_flag and
4828 * flow-control before coming here. If we end up
4829 * here it means that we can simply pass down the
4830 * data to tcp.
4831 */
4832 ASSERT(mp != NULL);
4833 if (stp->sd_wputdatafunc != NULL) {
4834 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4835 NULL, NULL, NULL);
4836 if (newmp == NULL) {
4837 /* The caller will free mp */
4838 return (ECOMM);
4839 }
4840 mp = newmp;
4841 }
4842 tcp_wput(tcp_wq, mp);
4843 return (0);
4844 }
4845
4846 /* Fallback to strwrite() to do proper error handling */
4847 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4848 return (strwrite(SOTOV(so), uiop, cr));
4849
4850 rmax = stp->sd_qn_maxpsz;
4851 ASSERT(rmax >= 0 || rmax == INFPSZ);
4852 if (rmax == 0 || uiop->uio_resid <= 0)
4853 return (0);
4854
4855 if (rmax == INFPSZ)
4856 rmax = uiop->uio_resid;
4857
4858 maxblk = stp->sd_maxblk;
4859
4860 for (;;) {
4861 iosize = MIN(uiop->uio_resid, rmax);
4862
4863 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4864 if (mp == NULL) {
4865 /*
4866 * Fallback to strwrite() for ENOMEM; if this
4867 * is our first time in this routine and the uio
4868 * vector has not been modified, we will end up
4869 * calling strwrite() without any flag set.
4870 */
4871 if (error == ENOMEM)
4872 goto slow_send;
4873 else
4874 return (error);
4875 }
4876 ASSERT(uiop->uio_resid >= 0);
4877 /*
4878 * If mp is non-NULL and ENOMEM is set, it means that
4879 * mcopyinuio() was able to break down some of the user
4880 * data into one or more mblks. Send the partial data
4881 * to tcp and let the rest be handled in strwrite().
4882 */
4883 ASSERT(error == 0 || error == ENOMEM);
4884 if (stp->sd_wputdatafunc != NULL) {
4885 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4886 NULL, NULL, NULL);
4887 if (newmp == NULL) {
4888 /* The caller will free mp */
4889 return (ECOMM);
4890 }
4891 mp = newmp;
4892 }
4893 tcp_wput(tcp_wq, mp);
4894
4895 wflag |= NOINTR;
4896
4897 if (uiop->uio_resid == 0) { /* No more data; we're done */
4898 ASSERT(error == 0);
4899 break;
4900 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4901 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4902 slow_send:
4903 /*
4904 * We were able to send down partial data using
4905 * the direct call interface, but are now relying
4906 * on strwrite() to handle the non-fastpath cases.
4907 * If the socket is blocking we will sleep in
4908 * strwaitq() until write is permitted, otherwise,
4909 * we will need to return the amount of bytes
4910 * written so far back to the app. This is the
4911 * reason why we pass NOINTR flag to strwrite()
4912 * for non-blocking socket, because we don't want
4913 * to return EAGAIN when portion of the user data
4914 * has actually been sent down.
4915 */
4916 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4917 }
4918 }
4919 return (0);
4920 }
4921
4922 /*
4923 * Update sti_faddr by asking the transport (unless AF_UNIX).
4924 */
4925 /* ARGSUSED */
4926 int
sotpi_getpeername(struct sonode * so,struct sockaddr * name,socklen_t * namelen,boolean_t accept,struct cred * cr)4927 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4928 boolean_t accept, struct cred *cr)
4929 {
4930 struct strbuf strbuf;
4931 int error = 0, res;
4932 void *addr;
4933 t_uscalar_t addrlen;
4934 k_sigset_t smask;
4935 sotpi_info_t *sti = SOTOTPI(so);
4936
4937 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4938 (void *)so, pr_state(so->so_state, so->so_mode)));
4939
4940 ASSERT(*namelen > 0);
4941 mutex_enter(&so->so_lock);
4942 so_lock_single(so); /* Set SOLOCKED */
4943
4944 if (accept) {
4945 bcopy(sti->sti_faddr_sa, name,
4946 MIN(*namelen, sti->sti_faddr_len));
4947 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4948 goto done;
4949 }
4950
4951 if (!(so->so_state & SS_ISCONNECTED)) {
4952 error = ENOTCONN;
4953 goto done;
4954 }
4955 /* Added this check for X/Open */
4956 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4957 error = EINVAL;
4958 if (xnet_check_print) {
4959 printf("sockfs: X/Open getpeername check => EINVAL\n");
4960 }
4961 goto done;
4962 }
4963
4964 if (sti->sti_faddr_valid) {
4965 bcopy(sti->sti_faddr_sa, name,
4966 MIN(*namelen, sti->sti_faddr_len));
4967 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4968 goto done;
4969 }
4970
4971 #ifdef DEBUG
4972 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4973 pr_addr(so->so_family, sti->sti_faddr_sa,
4974 (t_uscalar_t)sti->sti_faddr_len)));
4975 #endif /* DEBUG */
4976
4977 if (so->so_family == AF_UNIX) {
4978 /* Transport has different name space - return local info */
4979 if (sti->sti_faddr_noxlate)
4980 *namelen = 0;
4981 error = 0;
4982 goto done;
4983 }
4984
4985 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4986
4987 ASSERT(sti->sti_faddr_sa);
4988 /* Allocate local buffer to use with ioctl */
4989 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4990 mutex_exit(&so->so_lock);
4991 addr = kmem_alloc(addrlen, KM_SLEEP);
4992
4993 /*
4994 * Issue TI_GETPEERNAME with signals masked.
4995 * Put the result in sti_faddr_sa so that getpeername works after
4996 * a shutdown(output).
4997 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4998 * back to the socket.
4999 */
5000 strbuf.buf = addr;
5001 strbuf.maxlen = addrlen;
5002 strbuf.len = 0;
5003
5004 sigintr(&smask, 0);
5005 res = 0;
5006 ASSERT(cr);
5007 error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
5008 0, K_TO_K, cr, &res);
5009 sigunintr(&smask);
5010
5011 mutex_enter(&so->so_lock);
5012 /*
5013 * If there is an error record the error in so_error put don't fail
5014 * the getpeername. Instead fallback on the recorded
5015 * sti->sti_faddr_sa.
5016 */
5017 if (error) {
5018 /*
5019 * Various stream head errors can be returned to the ioctl.
5020 * However, it is impossible to determine which ones of
5021 * these are really socket level errors that were incorrectly
5022 * consumed by the ioctl. Thus this code silently ignores the
5023 * error - to code explicitly does not reinstate the error
5024 * using soseterror().
5025 * Experiments have shows that at least this set of
5026 * errors are reported and should not be reinstated on the
5027 * socket:
5028 * EINVAL E.g. if an I_LINK was in effect when
5029 * getpeername was called.
5030 * EPIPE The ioctl error semantics prefer the write
5031 * side error over the read side error.
5032 * ENOTCONN The transport just got disconnected but
5033 * sockfs had not yet seen the T_DISCON_IND
5034 * when issuing the ioctl.
5035 */
5036 error = 0;
5037 } else if (res == 0 && strbuf.len > 0 &&
5038 (so->so_state & SS_ISCONNECTED)) {
5039 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
5040 sti->sti_faddr_len = (socklen_t)strbuf.len;
5041 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
5042 sti->sti_faddr_valid = 1;
5043
5044 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
5045 *namelen = sti->sti_faddr_len;
5046 }
5047 kmem_free(addr, addrlen);
5048 #ifdef DEBUG
5049 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
5050 pr_addr(so->so_family, sti->sti_faddr_sa,
5051 (t_uscalar_t)sti->sti_faddr_len)));
5052 #endif /* DEBUG */
5053 done:
5054 so_unlock_single(so, SOLOCKED);
5055 mutex_exit(&so->so_lock);
5056 return (error);
5057 }
5058
5059 /*
5060 * Update sti_laddr by asking the transport (unless AF_UNIX).
5061 */
5062 int
sotpi_getsockname(struct sonode * so,struct sockaddr * name,socklen_t * namelen,struct cred * cr)5063 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
5064 struct cred *cr)
5065 {
5066 struct strbuf strbuf;
5067 int error = 0, res;
5068 void *addr;
5069 t_uscalar_t addrlen;
5070 k_sigset_t smask;
5071 sotpi_info_t *sti = SOTOTPI(so);
5072
5073 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
5074 (void *)so, pr_state(so->so_state, so->so_mode)));
5075
5076 ASSERT(*namelen > 0);
5077 mutex_enter(&so->so_lock);
5078 so_lock_single(so); /* Set SOLOCKED */
5079
5080 #ifdef DEBUG
5081
5082 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
5083 pr_addr(so->so_family, sti->sti_laddr_sa,
5084 (t_uscalar_t)sti->sti_laddr_len)));
5085 #endif /* DEBUG */
5086 if (sti->sti_laddr_valid) {
5087 bcopy(sti->sti_laddr_sa, name,
5088 MIN(*namelen, sti->sti_laddr_len));
5089 *namelen = sti->sti_laddr_len;
5090 goto done;
5091 }
5092
5093 if (so->so_family == AF_UNIX) {
5094 /*
5095 * Transport has different name space - return local info. If we
5096 * have enough space, let consumers know the family.
5097 */
5098 if (*namelen >= sizeof (sa_family_t)) {
5099 name->sa_family = AF_UNIX;
5100 *namelen = sizeof (sa_family_t);
5101 } else {
5102 *namelen = 0;
5103 }
5104 error = 0;
5105 goto done;
5106 }
5107 if (!(so->so_state & SS_ISBOUND)) {
5108 /* If not bound, then nothing to return. */
5109 error = 0;
5110 goto done;
5111 }
5112
5113 /* Allocate local buffer to use with ioctl */
5114 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
5115 mutex_exit(&so->so_lock);
5116 addr = kmem_alloc(addrlen, KM_SLEEP);
5117
5118 /*
5119 * Issue TI_GETMYNAME with signals masked.
5120 * Put the result in sti_laddr_sa so that getsockname works after
5121 * a shutdown(output).
5122 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
5123 * back to the socket.
5124 */
5125 strbuf.buf = addr;
5126 strbuf.maxlen = addrlen;
5127 strbuf.len = 0;
5128
5129 sigintr(&smask, 0);
5130 res = 0;
5131 ASSERT(cr);
5132 error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
5133 0, K_TO_K, cr, &res);
5134 sigunintr(&smask);
5135
5136 mutex_enter(&so->so_lock);
5137 /*
5138 * If there is an error record the error in so_error put don't fail
5139 * the getsockname. Instead fallback on the recorded
5140 * sti->sti_laddr_sa.
5141 */
5142 if (error) {
5143 /*
5144 * Various stream head errors can be returned to the ioctl.
5145 * However, it is impossible to determine which ones of
5146 * these are really socket level errors that were incorrectly
5147 * consumed by the ioctl. Thus this code silently ignores the
5148 * error - to code explicitly does not reinstate the error
5149 * using soseterror().
5150 * Experiments have shows that at least this set of
5151 * errors are reported and should not be reinstated on the
5152 * socket:
5153 * EINVAL E.g. if an I_LINK was in effect when
5154 * getsockname was called.
5155 * EPIPE The ioctl error semantics prefer the write
5156 * side error over the read side error.
5157 */
5158 error = 0;
5159 } else if (res == 0 && strbuf.len > 0 &&
5160 (so->so_state & SS_ISBOUND)) {
5161 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
5162 sti->sti_laddr_len = (socklen_t)strbuf.len;
5163 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
5164 sti->sti_laddr_valid = 1;
5165
5166 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
5167 *namelen = sti->sti_laddr_len;
5168 }
5169 kmem_free(addr, addrlen);
5170 #ifdef DEBUG
5171 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
5172 pr_addr(so->so_family, sti->sti_laddr_sa,
5173 (t_uscalar_t)sti->sti_laddr_len)));
5174 #endif /* DEBUG */
5175 done:
5176 so_unlock_single(so, SOLOCKED);
5177 mutex_exit(&so->so_lock);
5178 return (error);
5179 }
5180
5181 /*
5182 * Get socket options. For SOL_SOCKET options some options are handled
5183 * by the sockfs while others use the value recorded in the sonode as a
5184 * fallback should the T_SVR4_OPTMGMT_REQ fail.
5185 *
5186 * On return, at most *optlenp bytes are copied to optval.
5187 */
5188 /* ARGSUSED */
5189 int
sotpi_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,struct cred * cr)5190 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5191 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5192 {
5193 struct T_optmgmt_req optmgmt_req;
5194 struct T_optmgmt_ack *optmgmt_ack;
5195 struct opthdr oh;
5196 struct opthdr *opt_res;
5197 mblk_t *mp = NULL;
5198 int error = 0;
5199 void *option = NULL; /* Set if fallback value */
5200 t_uscalar_t maxlen = *optlenp;
5201 t_uscalar_t len;
5202 uint32_t value;
5203 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5204 struct timeval32 tmo_val32;
5205 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
5206
5207 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5208 (void *)so, level, option_name, optval, (void *)optlenp,
5209 pr_state(so->so_state, so->so_mode)));
5210
5211 mutex_enter(&so->so_lock);
5212 so_lock_single(so); /* Set SOLOCKED */
5213
5214 /*
5215 * Check for SOL_SOCKET options.
5216 * Certain SOL_SOCKET options are returned directly whereas
5217 * others only provide a default (fallback) value should
5218 * the T_SVR4_OPTMGMT_REQ fail.
5219 */
5220 if (level == SOL_SOCKET) {
5221 /* Check parameters */
5222 switch (option_name) {
5223 case SO_TYPE:
5224 case SO_ERROR:
5225 case SO_DEBUG:
5226 case SO_ACCEPTCONN:
5227 case SO_REUSEADDR:
5228 case SO_REUSEPORT:
5229 case SO_KEEPALIVE:
5230 case SO_DONTROUTE:
5231 case SO_BROADCAST:
5232 case SO_USELOOPBACK:
5233 case SO_OOBINLINE:
5234 case SO_SNDBUF:
5235 case SO_RCVBUF:
5236 #ifdef notyet
5237 case SO_SNDLOWAT:
5238 case SO_RCVLOWAT:
5239 #endif /* notyet */
5240 case SO_DOMAIN:
5241 case SO_DGRAM_ERRIND:
5242 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5243 error = EINVAL;
5244 eprintsoline(so, error);
5245 goto done2;
5246 }
5247 break;
5248 case SO_RCVTIMEO:
5249 case SO_SNDTIMEO:
5250 if (get_udatamodel() == DATAMODEL_NONE ||
5251 get_udatamodel() == DATAMODEL_NATIVE) {
5252 if (maxlen < sizeof (struct timeval)) {
5253 error = EINVAL;
5254 eprintsoline(so, error);
5255 goto done2;
5256 }
5257 } else {
5258 if (maxlen < sizeof (struct timeval32)) {
5259 error = EINVAL;
5260 eprintsoline(so, error);
5261 goto done2;
5262 }
5263
5264 }
5265 break;
5266 case SO_LINGER:
5267 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5268 error = EINVAL;
5269 eprintsoline(so, error);
5270 goto done2;
5271 }
5272 break;
5273 case SO_SND_BUFINFO:
5274 if (maxlen < (t_uscalar_t)
5275 sizeof (struct so_snd_bufinfo)) {
5276 error = EINVAL;
5277 eprintsoline(so, error);
5278 goto done2;
5279 }
5280 break;
5281 }
5282
5283 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5284
5285 switch (option_name) {
5286 case SO_TYPE:
5287 value = so->so_type;
5288 option = &value;
5289 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5290
5291 case SO_ERROR:
5292 value = sogeterr(so, B_TRUE);
5293 option = &value;
5294 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5295
5296 case SO_ACCEPTCONN:
5297 if (so->so_state & SS_ACCEPTCONN)
5298 value = SO_ACCEPTCONN;
5299 else
5300 value = 0;
5301 #ifdef DEBUG
5302 if (value) {
5303 dprintso(so, 1,
5304 ("sotpi_getsockopt: 0x%x is set\n",
5305 option_name));
5306 } else {
5307 dprintso(so, 1,
5308 ("sotpi_getsockopt: 0x%x not set\n",
5309 option_name));
5310 }
5311 #endif /* DEBUG */
5312 option = &value;
5313 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5314
5315 case SO_DEBUG:
5316 case SO_REUSEADDR:
5317 case SO_REUSEPORT:
5318 case SO_KEEPALIVE:
5319 case SO_DONTROUTE:
5320 case SO_BROADCAST:
5321 case SO_USELOOPBACK:
5322 case SO_OOBINLINE:
5323 case SO_DGRAM_ERRIND:
5324 value = (so->so_options & option_name);
5325 #ifdef DEBUG
5326 if (value) {
5327 dprintso(so, 1,
5328 ("sotpi_getsockopt: 0x%x is set\n",
5329 option_name));
5330 } else {
5331 dprintso(so, 1,
5332 ("sotpi_getsockopt: 0x%x not set\n",
5333 option_name));
5334 }
5335 #endif /* DEBUG */
5336 option = &value;
5337 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5338
5339 /*
5340 * The following options are only returned by sockfs when the
5341 * T_SVR4_OPTMGMT_REQ fails.
5342 */
5343 case SO_LINGER:
5344 option = &so->so_linger;
5345 len = (t_uscalar_t)sizeof (struct linger);
5346 break;
5347 case SO_SNDBUF: {
5348 ssize_t lvalue;
5349
5350 /*
5351 * If the option has not been set then get a default
5352 * value from the read queue. This value is
5353 * returned if the transport fails
5354 * the T_SVR4_OPTMGMT_REQ.
5355 */
5356 lvalue = so->so_sndbuf;
5357 if (lvalue == 0) {
5358 mutex_exit(&so->so_lock);
5359 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5360 QHIWAT, 0, &lvalue);
5361 mutex_enter(&so->so_lock);
5362 dprintso(so, 1,
5363 ("got SO_SNDBUF %ld from q\n", lvalue));
5364 }
5365 value = (int)lvalue;
5366 option = &value;
5367 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5368 break;
5369 }
5370 case SO_RCVBUF: {
5371 ssize_t lvalue;
5372
5373 /*
5374 * If the option has not been set then get a default
5375 * value from the read queue. This value is
5376 * returned if the transport fails
5377 * the T_SVR4_OPTMGMT_REQ.
5378 *
5379 * XXX If SO_RCVBUF has been set and this is an
5380 * XPG 4.2 application then do not ask the transport
5381 * since the transport might adjust the value and not
5382 * return exactly what was set by the application.
5383 * For non-XPG 4.2 application we return the value
5384 * that the transport is actually using.
5385 */
5386 lvalue = so->so_rcvbuf;
5387 if (lvalue == 0) {
5388 mutex_exit(&so->so_lock);
5389 (void) strqget(RD(strvp2wq(SOTOV(so))),
5390 QHIWAT, 0, &lvalue);
5391 mutex_enter(&so->so_lock);
5392 dprintso(so, 1,
5393 ("got SO_RCVBUF %ld from q\n", lvalue));
5394 } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5395 value = (int)lvalue;
5396 option = &value;
5397 goto copyout; /* skip asking transport */
5398 }
5399 value = (int)lvalue;
5400 option = &value;
5401 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5402 break;
5403 }
5404 case SO_DOMAIN:
5405 value = so->so_family;
5406 option = &value;
5407 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5408
5409 #ifdef notyet
5410 /*
5411 * We do not implement the semantics of these options
5412 * thus we shouldn't implement the options either.
5413 */
5414 case SO_SNDLOWAT:
5415 value = so->so_sndlowat;
5416 option = &value;
5417 break;
5418 case SO_RCVLOWAT:
5419 value = so->so_rcvlowat;
5420 option = &value;
5421 break;
5422 #endif /* notyet */
5423 case SO_SNDTIMEO:
5424 case SO_RCVTIMEO: {
5425 clock_t val;
5426
5427 if (option_name == SO_RCVTIMEO)
5428 val = drv_hztousec(so->so_rcvtimeo);
5429 else
5430 val = drv_hztousec(so->so_sndtimeo);
5431 tmo_val.tv_sec = val / (1000 * 1000);
5432 tmo_val.tv_usec = val % (1000 * 1000);
5433 if (get_udatamodel() == DATAMODEL_NONE ||
5434 get_udatamodel() == DATAMODEL_NATIVE) {
5435 option = &tmo_val;
5436 len = sizeof (struct timeval);
5437 } else {
5438 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5439 option = &tmo_val32;
5440 len = sizeof (struct timeval32);
5441 }
5442 break;
5443 }
5444 case SO_SND_BUFINFO: {
5445 snd_bufinfo.sbi_wroff =
5446 (so->so_proto_props).sopp_wroff;
5447 snd_bufinfo.sbi_maxblk =
5448 (so->so_proto_props).sopp_maxblk;
5449 snd_bufinfo.sbi_maxpsz =
5450 (so->so_proto_props).sopp_maxpsz;
5451 snd_bufinfo.sbi_tail =
5452 (so->so_proto_props).sopp_tail;
5453 option = &snd_bufinfo;
5454 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5455 break;
5456 }
5457 }
5458 }
5459
5460 mutex_exit(&so->so_lock);
5461
5462 /* Send request */
5463 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5464 optmgmt_req.MGMT_flags = T_CHECK;
5465 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5466 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5467
5468 oh.level = level;
5469 oh.name = option_name;
5470 oh.len = maxlen;
5471
5472 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5473 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5474 /* Let option management work in the presence of data flow control */
5475 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5476 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5477 mp = NULL;
5478 mutex_enter(&so->so_lock);
5479 if (error) {
5480 eprintsoline(so, error);
5481 goto done2;
5482 }
5483 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5484 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5485 if (error) {
5486 if (option != NULL) {
5487 /* We have a fallback value */
5488 error = 0;
5489 goto copyout;
5490 }
5491 eprintsoline(so, error);
5492 goto done2;
5493 }
5494 ASSERT(mp);
5495 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5496 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5497 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5498 if (opt_res == NULL) {
5499 if (option != NULL) {
5500 /* We have a fallback value */
5501 error = 0;
5502 goto copyout;
5503 }
5504 error = EPROTO;
5505 eprintsoline(so, error);
5506 goto done;
5507 }
5508 option = &opt_res[1];
5509
5510 /* check to ensure that the option is within bounds */
5511 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5512 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5513 if (option != NULL) {
5514 /* We have a fallback value */
5515 error = 0;
5516 goto copyout;
5517 }
5518 error = EPROTO;
5519 eprintsoline(so, error);
5520 goto done;
5521 }
5522
5523 len = opt_res->len;
5524
5525 copyout: {
5526 t_uscalar_t size = MIN(len, maxlen);
5527 bcopy(option, optval, size);
5528 bcopy(&size, optlenp, sizeof (size));
5529 }
5530 done:
5531 freemsg(mp);
5532 done2:
5533 so_unlock_single(so, SOLOCKED);
5534 mutex_exit(&so->so_lock);
5535
5536 return (error);
5537 }
5538
5539 /*
5540 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5541 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5542 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5543 * setsockopt has to work even if the transport does not support the option.
5544 */
5545 /* ARGSUSED */
5546 int
sotpi_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,struct cred * cr)5547 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5548 const void *optval, t_uscalar_t optlen, struct cred *cr)
5549 {
5550 struct T_optmgmt_req optmgmt_req;
5551 struct opthdr oh;
5552 mblk_t *mp;
5553 int error = 0;
5554 boolean_t handled = B_FALSE;
5555
5556 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5557 (void *)so, level, option_name, optval, optlen,
5558 pr_state(so->so_state, so->so_mode)));
5559
5560 /* X/Open requires this check */
5561 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5562 if (xnet_check_print)
5563 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5564 return (EINVAL);
5565 }
5566
5567 mutex_enter(&so->so_lock);
5568 so_lock_single(so); /* Set SOLOCKED */
5569 mutex_exit(&so->so_lock);
5570
5571 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5572 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5573 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5574 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5575
5576 oh.level = level;
5577 oh.name = option_name;
5578 oh.len = optlen;
5579
5580 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5581 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5582 /* Let option management work in the presence of data flow control */
5583 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5584 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5585 mp = NULL;
5586 mutex_enter(&so->so_lock);
5587 if (error) {
5588 eprintsoline(so, error);
5589 goto done2;
5590 }
5591 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5592 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5593 if (error) {
5594 eprintsoline(so, error);
5595 goto done;
5596 }
5597 ASSERT(mp);
5598 /* No need to verify T_optmgmt_ack */
5599 freemsg(mp);
5600 done:
5601 /*
5602 * Check for SOL_SOCKET options and record their values.
5603 * If we know about a SOL_SOCKET parameter and the transport
5604 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5605 * EPROTO) we let the setsockopt succeed.
5606 */
5607 if (level == SOL_SOCKET) {
5608 /* Check parameters */
5609 switch (option_name) {
5610 case SO_DEBUG:
5611 case SO_REUSEADDR:
5612 case SO_REUSEPORT:
5613 case SO_KEEPALIVE:
5614 case SO_DONTROUTE:
5615 case SO_BROADCAST:
5616 case SO_USELOOPBACK:
5617 case SO_OOBINLINE:
5618 case SO_SNDBUF:
5619 case SO_RCVBUF:
5620 #ifdef notyet
5621 case SO_SNDLOWAT:
5622 case SO_RCVLOWAT:
5623 #endif /* notyet */
5624 case SO_DGRAM_ERRIND:
5625 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5626 error = EINVAL;
5627 eprintsoline(so, error);
5628 goto done2;
5629 }
5630 ASSERT(optval);
5631 handled = B_TRUE;
5632 break;
5633 case SO_SNDTIMEO:
5634 case SO_RCVTIMEO:
5635 if (get_udatamodel() == DATAMODEL_NONE ||
5636 get_udatamodel() == DATAMODEL_NATIVE) {
5637 if (optlen != sizeof (struct timeval)) {
5638 error = EINVAL;
5639 eprintsoline(so, error);
5640 goto done2;
5641 }
5642 } else {
5643 if (optlen != sizeof (struct timeval32)) {
5644 error = EINVAL;
5645 eprintsoline(so, error);
5646 goto done2;
5647 }
5648 }
5649 ASSERT(optval);
5650 handled = B_TRUE;
5651 break;
5652 case SO_LINGER:
5653 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5654 error = EINVAL;
5655 eprintsoline(so, error);
5656 goto done2;
5657 }
5658 ASSERT(optval);
5659 handled = B_TRUE;
5660 break;
5661 }
5662
5663 #define intvalue (*(int32_t *)optval)
5664
5665 switch (option_name) {
5666 case SO_TYPE:
5667 case SO_ERROR:
5668 case SO_ACCEPTCONN:
5669 /* Can't be set */
5670 error = ENOPROTOOPT;
5671 goto done2;
5672 case SO_LINGER: {
5673 struct linger *l = (struct linger *)optval;
5674
5675 so->so_linger.l_linger = l->l_linger;
5676 if (l->l_onoff) {
5677 so->so_linger.l_onoff = SO_LINGER;
5678 so->so_options |= SO_LINGER;
5679 } else {
5680 so->so_linger.l_onoff = 0;
5681 so->so_options &= ~SO_LINGER;
5682 }
5683 break;
5684 }
5685
5686 case SO_DEBUG:
5687 #ifdef SOCK_TEST
5688 if (intvalue & 2)
5689 sock_test_timelimit = 10 * hz;
5690 else
5691 sock_test_timelimit = 0;
5692
5693 if (intvalue & 4)
5694 do_useracc = 0;
5695 else
5696 do_useracc = 1;
5697 #endif /* SOCK_TEST */
5698 /* FALLTHRU */
5699 case SO_REUSEADDR:
5700 case SO_REUSEPORT:
5701 case SO_KEEPALIVE:
5702 case SO_DONTROUTE:
5703 case SO_BROADCAST:
5704 case SO_USELOOPBACK:
5705 case SO_OOBINLINE:
5706 case SO_DGRAM_ERRIND:
5707 if (intvalue != 0) {
5708 dprintso(so, 1,
5709 ("socket_setsockopt: setting 0x%x\n",
5710 option_name));
5711 so->so_options |= option_name;
5712 } else {
5713 dprintso(so, 1,
5714 ("socket_setsockopt: clearing 0x%x\n",
5715 option_name));
5716 so->so_options &= ~option_name;
5717 }
5718 break;
5719 /*
5720 * The following options are only returned by us when the
5721 * transport layer fails.
5722 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5723 * since the transport might adjust the value and not
5724 * return exactly what was set by the application.
5725 */
5726 case SO_SNDBUF:
5727 so->so_sndbuf = intvalue;
5728 break;
5729 case SO_RCVBUF:
5730 so->so_rcvbuf = intvalue;
5731 break;
5732 case SO_RCVPSH:
5733 so->so_rcv_timer_interval = intvalue;
5734 break;
5735 #ifdef notyet
5736 /*
5737 * We do not implement the semantics of these options
5738 * thus we shouldn't implement the options either.
5739 */
5740 case SO_SNDLOWAT:
5741 so->so_sndlowat = intvalue;
5742 break;
5743 case SO_RCVLOWAT:
5744 so->so_rcvlowat = intvalue;
5745 break;
5746 #endif /* notyet */
5747 case SO_SNDTIMEO:
5748 case SO_RCVTIMEO: {
5749 struct timeval tl;
5750 clock_t val;
5751
5752 if (get_udatamodel() == DATAMODEL_NONE ||
5753 get_udatamodel() == DATAMODEL_NATIVE)
5754 bcopy(&tl, (struct timeval *)optval,
5755 sizeof (struct timeval));
5756 else
5757 TIMEVAL32_TO_TIMEVAL(&tl,
5758 (struct timeval32 *)optval);
5759 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5760 if (option_name == SO_RCVTIMEO)
5761 so->so_rcvtimeo = drv_usectohz(val);
5762 else
5763 so->so_sndtimeo = drv_usectohz(val);
5764 break;
5765 }
5766 }
5767 #undef intvalue
5768
5769 if (error) {
5770 if ((error == ENOPROTOOPT || error == EPROTO ||
5771 error == EINVAL) && handled) {
5772 dprintso(so, 1,
5773 ("setsockopt: ignoring error %d for 0x%x\n",
5774 error, option_name));
5775 error = 0;
5776 }
5777 }
5778 }
5779 done2:
5780 so_unlock_single(so, SOLOCKED);
5781 mutex_exit(&so->so_lock);
5782 return (error);
5783 }
5784
5785 /*
5786 * sotpi_close() is called when the last open reference goes away.
5787 */
5788 /* ARGSUSED */
5789 int
sotpi_close(struct sonode * so,int flag,struct cred * cr)5790 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5791 {
5792 struct vnode *vp = SOTOV(so);
5793 dev_t dev;
5794 int error = 0;
5795 sotpi_info_t *sti = SOTOTPI(so);
5796
5797 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5798 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5799
5800 dev = sti->sti_dev;
5801
5802 ASSERT(STREAMSTAB(getmajor(dev)));
5803
5804 mutex_enter(&so->so_lock);
5805 so_lock_single(so); /* Set SOLOCKED */
5806
5807 ASSERT(so_verify_oobstate(so));
5808
5809 if (sti->sti_nl7c_flags & NL7C_ENABLED) {
5810 sti->sti_nl7c_flags = 0;
5811 nl7c_close(so);
5812 }
5813
5814 if (vp->v_stream != NULL) {
5815 vnode_t *ux_vp;
5816
5817 if (so->so_family == AF_UNIX) {
5818 /* Could avoid this when CANTSENDMORE for !dgram */
5819 so_unix_close(so);
5820 }
5821
5822 mutex_exit(&so->so_lock);
5823 /*
5824 * Disassemble the linkage from the AF_UNIX underlying file
5825 * system vnode to this socket (by atomically clearing
5826 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5827 * and frees the stream head.
5828 */
5829 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5830 ASSERT(ux_vp->v_stream);
5831 sti->sti_ux_bound_vp = NULL;
5832 vn_rele_stream(ux_vp);
5833 }
5834 error = strclose(vp, flag, cr);
5835 vp->v_stream = NULL;
5836 mutex_enter(&so->so_lock);
5837 }
5838
5839 /*
5840 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5841 */
5842 so_flush_discon_ind(so);
5843
5844 so_unlock_single(so, SOLOCKED);
5845 mutex_exit(&so->so_lock);
5846
5847 /*
5848 * Needed for STREAMs.
5849 * Decrement the device driver's reference count for streams
5850 * opened via the clone dip. The driver was held in clone_open().
5851 * The absence of clone_close() forces this asymmetry.
5852 */
5853 if (so->so_flag & SOCLONE)
5854 ddi_rele_driver(getmajor(dev));
5855
5856 return (error);
5857 }
5858
5859 static int
sotpi_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5860 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5861 struct cred *cr, int32_t *rvalp)
5862 {
5863 struct vnode *vp = SOTOV(so);
5864 sotpi_info_t *sti = SOTOTPI(so);
5865 int error = 0;
5866
5867 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5868 cmd, arg, pr_state(so->so_state, so->so_mode)));
5869
5870 switch (cmd) {
5871 case SIOCSQPTR:
5872 /*
5873 * SIOCSQPTR is valid only when helper stream is created
5874 * by the protocol.
5875 */
5876 case _I_INSERT:
5877 case _I_REMOVE:
5878 /*
5879 * Since there's no compelling reason to support these ioctls
5880 * on sockets, and doing so would increase the complexity
5881 * markedly, prevent it.
5882 */
5883 return (EOPNOTSUPP);
5884
5885 case I_FIND:
5886 case I_LIST:
5887 case I_LOOK:
5888 case I_POP:
5889 case I_PUSH:
5890 /*
5891 * To prevent races and inconsistencies between the actual
5892 * state of the stream and the state according to the sonode,
5893 * we serialize all operations which modify or operate on the
5894 * list of modules on the socket's stream.
5895 */
5896 mutex_enter(&sti->sti_plumb_lock);
5897 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5898 mutex_exit(&sti->sti_plumb_lock);
5899 return (error);
5900
5901 default:
5902 if (so->so_version != SOV_STREAM)
5903 break;
5904
5905 /*
5906 * The imaginary "sockmod" has been popped; act as a stream.
5907 */
5908 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5909 }
5910
5911 ASSERT(so->so_version != SOV_STREAM);
5912
5913 /*
5914 * Process socket-specific ioctls.
5915 */
5916 switch (cmd) {
5917 case FIONBIO: {
5918 int32_t value;
5919
5920 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5921 (mode & (int)FKIOCTL)))
5922 return (EFAULT);
5923
5924 mutex_enter(&so->so_lock);
5925 if (value) {
5926 so->so_state |= SS_NDELAY;
5927 } else {
5928 so->so_state &= ~SS_NDELAY;
5929 }
5930 mutex_exit(&so->so_lock);
5931 return (0);
5932 }
5933
5934 case FIOASYNC: {
5935 int32_t value;
5936
5937 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5938 (mode & (int)FKIOCTL)))
5939 return (EFAULT);
5940
5941 mutex_enter(&so->so_lock);
5942 /*
5943 * SS_ASYNC flag not already set correctly?
5944 * (!value != !(so->so_state & SS_ASYNC))
5945 * but some engineers find that too hard to read.
5946 */
5947 if (value == 0 && (so->so_state & SS_ASYNC) != 0 ||
5948 value != 0 && (so->so_state & SS_ASYNC) == 0)
5949 error = so_flip_async(so, vp, mode, cr);
5950 mutex_exit(&so->so_lock);
5951 return (error);
5952 }
5953
5954 case SIOCSPGRP:
5955 case FIOSETOWN: {
5956 pid_t pgrp;
5957
5958 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5959 (mode & (int)FKIOCTL)))
5960 return (EFAULT);
5961
5962 mutex_enter(&so->so_lock);
5963 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5964 /* Any change? */
5965 if (pgrp != so->so_pgrp)
5966 error = so_set_siggrp(so, vp, pgrp, mode, cr);
5967 mutex_exit(&so->so_lock);
5968 return (error);
5969 }
5970 case SIOCGPGRP:
5971 case FIOGETOWN:
5972 if (so_copyout(&so->so_pgrp, (void *)arg,
5973 sizeof (pid_t), (mode & (int)FKIOCTL)))
5974 return (EFAULT);
5975 return (0);
5976
5977 case SIOCATMARK: {
5978 int retval;
5979 uint_t so_state;
5980
5981 /*
5982 * strwaitmark has a finite timeout after which it
5983 * returns -1 if the mark state is undetermined.
5984 * In order to avoid any race between the mark state
5985 * in sockfs and the mark state in the stream head this
5986 * routine loops until the mark state can be determined
5987 * (or the urgent data indication has been removed by some
5988 * other thread).
5989 */
5990 do {
5991 mutex_enter(&so->so_lock);
5992 so_state = so->so_state;
5993 mutex_exit(&so->so_lock);
5994 if (so_state & SS_RCVATMARK) {
5995 retval = 1;
5996 } else if (!(so_state & SS_OOBPEND)) {
5997 /*
5998 * No SIGURG has been generated -- there is no
5999 * pending or present urgent data. Thus can't
6000 * possibly be at the mark.
6001 */
6002 retval = 0;
6003 } else {
6004 /*
6005 * Have the stream head wait until there is
6006 * either some messages on the read queue, or
6007 * STRATMARK or STRNOTATMARK gets set. The
6008 * STRNOTATMARK flag is used so that the
6009 * transport can send up a MSGNOTMARKNEXT
6010 * M_DATA to indicate that it is not
6011 * at the mark and additional data is not about
6012 * to be send upstream.
6013 *
6014 * If the mark state is undetermined this will
6015 * return -1 and we will loop rechecking the
6016 * socket state.
6017 */
6018 retval = strwaitmark(vp);
6019 }
6020 } while (retval == -1);
6021
6022 if (so_copyout(&retval, (void *)arg, sizeof (int),
6023 (mode & (int)FKIOCTL)))
6024 return (EFAULT);
6025 return (0);
6026 }
6027
6028 case I_FDINSERT:
6029 case I_SENDFD:
6030 case I_RECVFD:
6031 case I_ATMARK:
6032 case _SIOCSOCKFALLBACK:
6033 /*
6034 * These ioctls do not apply to sockets. I_FDINSERT can be
6035 * used to send M_PROTO messages without modifying the socket
6036 * state. I_SENDFD/RECVFD should not be used for socket file
6037 * descriptor passing since they assume a twisted stream.
6038 * SIOCATMARK must be used instead of I_ATMARK.
6039 *
6040 * _SIOCSOCKFALLBACK from an application should never be
6041 * processed. It is only generated by socktpi_open() or
6042 * in response to I_POP or I_PUSH.
6043 */
6044 #ifdef DEBUG
6045 zcmn_err(getzoneid(), CE_WARN,
6046 "Unsupported STREAMS ioctl 0x%x on socket. "
6047 "Pid = %d\n", cmd, curproc->p_pid);
6048 #endif /* DEBUG */
6049 return (EOPNOTSUPP);
6050
6051 case _I_GETPEERCRED:
6052 if ((mode & FKIOCTL) == 0)
6053 return (EINVAL);
6054
6055 mutex_enter(&so->so_lock);
6056 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
6057 error = ENOTSUP;
6058 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
6059 error = ENOTCONN;
6060 } else if (so->so_peercred != NULL) {
6061 k_peercred_t *kp = (k_peercred_t *)arg;
6062 kp->pc_cr = so->so_peercred;
6063 kp->pc_cpid = so->so_cpid;
6064 crhold(so->so_peercred);
6065 } else {
6066 error = EINVAL;
6067 }
6068 mutex_exit(&so->so_lock);
6069 return (error);
6070
6071 default:
6072 /*
6073 * Do the higher-order bits of the ioctl cmd indicate
6074 * that it is an I_* streams ioctl?
6075 */
6076 if ((cmd & 0xffffff00U) == STR &&
6077 so->so_version == SOV_SOCKBSD) {
6078 #ifdef DEBUG
6079 zcmn_err(getzoneid(), CE_WARN,
6080 "Unsupported STREAMS ioctl 0x%x on socket. "
6081 "Pid = %d\n", cmd, curproc->p_pid);
6082 #endif /* DEBUG */
6083 return (EOPNOTSUPP);
6084 }
6085 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6086 }
6087 }
6088
6089 /*
6090 * Handle plumbing-related ioctls.
6091 */
6092 static int
socktpi_plumbioctl(struct vnode * vp,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)6093 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
6094 struct cred *cr, int32_t *rvalp)
6095 {
6096 static const char sockmod_name[] = "sockmod";
6097 struct sonode *so = VTOSO(vp);
6098 char mname[FMNAMESZ + 1];
6099 int error;
6100 sotpi_info_t *sti = SOTOTPI(so);
6101
6102 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
6103
6104 if (so->so_version == SOV_SOCKBSD)
6105 return (EOPNOTSUPP);
6106
6107 if (so->so_version == SOV_STREAM) {
6108 /*
6109 * The imaginary "sockmod" has been popped - act as a stream.
6110 * If this is a push of sockmod then change back to a socket.
6111 */
6112 if (cmd == I_PUSH) {
6113 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6114 (void *)arg, mname, sizeof (mname), NULL);
6115
6116 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
6117 dprintso(so, 0, ("socktpi_ioctl: going to "
6118 "socket version\n"));
6119 so_stream2sock(so);
6120 return (0);
6121 }
6122 }
6123 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6124 }
6125
6126 switch (cmd) {
6127 case I_PUSH:
6128 if (sti->sti_direct) {
6129 mutex_enter(&so->so_lock);
6130 so_lock_single(so);
6131 mutex_exit(&so->so_lock);
6132
6133 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
6134 cr, rvalp);
6135
6136 mutex_enter(&so->so_lock);
6137 if (error == 0)
6138 sti->sti_direct = 0;
6139 so_unlock_single(so, SOLOCKED);
6140 mutex_exit(&so->so_lock);
6141
6142 if (error != 0)
6143 return (error);
6144 }
6145
6146 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6147 if (error == 0)
6148 sti->sti_pushcnt++;
6149 return (error);
6150
6151 case I_POP:
6152 if (sti->sti_pushcnt == 0) {
6153 /* Emulate sockmod being popped */
6154 dprintso(so, 0,
6155 ("socktpi_ioctl: going to STREAMS version\n"));
6156 return (so_sock2stream(so));
6157 }
6158
6159 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6160 if (error == 0)
6161 sti->sti_pushcnt--;
6162 return (error);
6163
6164 case I_LIST: {
6165 struct str_mlist *kmlistp, *umlistp;
6166 struct str_list kstrlist;
6167 ssize_t kstrlistsize;
6168 int i, nmods;
6169
6170 STRUCT_DECL(str_list, ustrlist);
6171 STRUCT_INIT(ustrlist, mode);
6172
6173 if (arg == NULL) {
6174 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6175 if (error == 0)
6176 (*rvalp)++; /* Add one for sockmod */
6177 return (error);
6178 }
6179
6180 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6181 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6182 if (error != 0)
6183 return (error);
6184
6185 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6186 if (nmods <= 0)
6187 return (EINVAL);
6188 /*
6189 * Ceiling nmods at nstrpush to prevent someone from
6190 * maliciously consuming lots of kernel memory.
6191 */
6192 nmods = MIN(nmods, nstrpush);
6193
6194 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6195 kstrlist.sl_nmods = nmods;
6196 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6197
6198 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6199 cr, rvalp);
6200 if (error != 0)
6201 goto done;
6202
6203 /*
6204 * Considering the module list as a 0-based array of sl_nmods
6205 * modules, sockmod should conceptually exist at slot
6206 * sti_pushcnt. Insert sockmod at this location by sliding all
6207 * of the module names after so_pushcnt over by one. We know
6208 * that there will be room to do this since we allocated
6209 * sl_modlist with an additional slot.
6210 */
6211 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6212 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6213
6214 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6215 kstrlist.sl_nmods++;
6216
6217 /*
6218 * Copy all of the entries out to ustrlist.
6219 */
6220 kmlistp = kstrlist.sl_modlist;
6221 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6222 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6223 error = so_copyout(kmlistp++, umlistp++,
6224 sizeof (struct str_mlist), mode & FKIOCTL);
6225 if (error != 0)
6226 goto done;
6227 }
6228
6229 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6230 mode & FKIOCTL);
6231 if (error == 0)
6232 *rvalp = 0;
6233 done:
6234 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6235 return (error);
6236 }
6237 case I_LOOK:
6238 if (sti->sti_pushcnt == 0) {
6239 return (so_copyout(sockmod_name, (void *)arg,
6240 sizeof (sockmod_name), mode & FKIOCTL));
6241 }
6242 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6243
6244 case I_FIND:
6245 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6246 if (error && error != EINVAL)
6247 return (error);
6248
6249 /* if not found and string was sockmod return 1 */
6250 if (*rvalp == 0 || error == EINVAL) {
6251 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6252 (void *)arg, mname, sizeof (mname), NULL);
6253 if (error == ENAMETOOLONG)
6254 error = EINVAL;
6255
6256 if (error == 0 && strcmp(mname, sockmod_name) == 0)
6257 *rvalp = 1;
6258 }
6259 return (error);
6260
6261 default:
6262 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6263 break;
6264 }
6265
6266 return (0);
6267 }
6268
6269 /*
6270 * Wrapper around the streams poll routine that implements socket poll
6271 * semantics.
6272 * The sockfs never calls pollwakeup itself - the stream head take care
6273 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6274 * stream head there can never be a deadlock due to holding so_lock across
6275 * pollwakeup and acquiring so_lock in this routine.
6276 *
6277 * However, since the performance of VOP_POLL is critical we avoid
6278 * acquiring so_lock here. This is based on two assumptions:
6279 * - The poll implementation holds locks to serialize the VOP_POLL call
6280 * and a pollwakeup for the same pollhead. This ensures that should
6281 * e.g. so_state change during a socktpi_poll call the pollwakeup
6282 * (which strsock_* and strrput conspire to issue) is issued after
6283 * the state change. Thus the pollwakeup will block until VOP_POLL has
6284 * returned and then wake up poll and have it call VOP_POLL again.
6285 * - The reading of so_state without holding so_lock does not result in
6286 * stale data that is older than the latest state change that has dropped
6287 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6288 * memory barrier to force the data into the coherency domain.
6289 */
6290 static int
sotpi_poll(struct sonode * so,short events,int anyyet,short * reventsp,struct pollhead ** phpp)6291 sotpi_poll(
6292 struct sonode *so,
6293 short events,
6294 int anyyet,
6295 short *reventsp,
6296 struct pollhead **phpp)
6297 {
6298 short origevents = events;
6299 struct vnode *vp = SOTOV(so);
6300 int error;
6301 int so_state = so->so_state; /* snapshot */
6302 sotpi_info_t *sti = SOTOTPI(so);
6303
6304 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6305 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6306
6307 ASSERT(vp->v_type == VSOCK);
6308 ASSERT(vp->v_stream != NULL);
6309
6310 if (so->so_version == SOV_STREAM) {
6311 /* The imaginary "sockmod" has been popped - act as a stream */
6312 return (strpoll(vp->v_stream, events, anyyet,
6313 reventsp, phpp));
6314 }
6315
6316 if (!(so_state & SS_ISCONNECTED) &&
6317 (so->so_mode & SM_CONNREQUIRED)) {
6318 /* Not connected yet - turn off write side events */
6319 events &= ~(POLLOUT|POLLWRBAND);
6320 }
6321 /*
6322 * Check for errors without calling strpoll if the caller wants them.
6323 * In sockets the errors are represented as input/output events
6324 * and there is no need to ask the stream head for this information.
6325 */
6326 if (so->so_error != 0 &&
6327 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6328 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6329 return (0);
6330 }
6331 /*
6332 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6333 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6334 * will not trigger a POLLIN event with POLLRDDATA set.
6335 * The handling of urgent data (causing POLLRDBAND) is done by
6336 * inspecting SS_OOBPEND below.
6337 */
6338 events |= POLLRDDATA;
6339
6340 /*
6341 * After shutdown(output) a stream head write error is set.
6342 * However, we should not return output events.
6343 */
6344 events |= POLLNOERR;
6345 error = strpoll(vp->v_stream, events, anyyet,
6346 reventsp, phpp);
6347 if (error)
6348 return (error);
6349
6350 ASSERT(!(*reventsp & POLLERR));
6351
6352 /*
6353 * Notes on T_CONN_IND handling for sockets.
6354 *
6355 * If strpoll() returned without events, SR_POLLIN is guaranteed
6356 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6357 *
6358 * Since the so_lock is not held, soqueueconnind() may have run
6359 * and a T_CONN_IND may be waiting. We now check for any queued
6360 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6361 * to ensure poll returns.
6362 *
6363 * However:
6364 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6365 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6366 * the following actions will occur; taken together they ensure the
6367 * syscall will return.
6368 *
6369 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6370 * the accept() was run on a non-blocking socket sowaitconnind()
6371 * may have already returned EWOULDBLOCK, so not be waiting to
6372 * process the message. Additionally socktpi_poll() has probably
6373 * proceeded past the sti_conn_ind_head check below.
6374 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6375 * this thread, however that could occur before poll_common()
6376 * has entered cv_wait.
6377 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6378 *
6379 * Before proceeding to cv_wait() in poll_common() for an event,
6380 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6381 * and if set, re-calls strpoll() to ensure the late arriving
6382 * T_CONN_IND is recognized, and pollsys() returns.
6383 */
6384
6385 if (sti->sti_conn_ind_head != NULL)
6386 *reventsp |= (POLLIN|POLLRDNORM) & events;
6387
6388 if (so->so_state & SS_CANTRCVMORE) {
6389 *reventsp |= POLLRDHUP & events;
6390
6391 if (so->so_state & SS_CANTSENDMORE)
6392 *reventsp |= POLLHUP;
6393 }
6394
6395 if (so->so_state & SS_OOBPEND)
6396 *reventsp |= POLLRDBAND & events;
6397
6398 if (sti->sti_nl7c_rcv_mp != NULL) {
6399 *reventsp |= (POLLIN|POLLRDNORM) & events;
6400 }
6401 if ((sti->sti_nl7c_flags & NL7C_ENABLED) &&
6402 ((POLLIN|POLLRDNORM) & *reventsp)) {
6403 sti->sti_nl7c_flags |= NL7C_POLLIN;
6404 }
6405
6406 return (0);
6407 }
6408
6409 /*ARGSUSED*/
6410 static int
socktpi_constructor(void * buf,void * cdrarg,int kmflags)6411 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6412 {
6413 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6414 int error = 0;
6415
6416 error = sonode_constructor(buf, cdrarg, kmflags);
6417 if (error != 0)
6418 return (error);
6419
6420 error = i_sotpi_info_constructor(&st->st_info);
6421 if (error != 0)
6422 sonode_destructor(buf, cdrarg);
6423
6424 st->st_sonode.so_priv = &st->st_info;
6425
6426 return (error);
6427 }
6428
6429 /*ARGSUSED1*/
6430 static void
socktpi_destructor(void * buf,void * cdrarg)6431 socktpi_destructor(void *buf, void *cdrarg)
6432 {
6433 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6434
6435 ASSERT(st->st_sonode.so_priv == &st->st_info);
6436 st->st_sonode.so_priv = NULL;
6437
6438 i_sotpi_info_destructor(&st->st_info);
6439 sonode_destructor(buf, cdrarg);
6440 }
6441
6442 static int
socktpi_unix_constructor(void * buf,void * cdrarg,int kmflags)6443 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6444 {
6445 int retval;
6446
6447 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6448 struct sonode *so = (struct sonode *)buf;
6449 sotpi_info_t *sti = SOTOTPI(so);
6450
6451 mutex_enter(&socklist.sl_lock);
6452
6453 sti->sti_next_so = socklist.sl_list;
6454 sti->sti_prev_so = NULL;
6455 if (sti->sti_next_so != NULL)
6456 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6457 socklist.sl_list = so;
6458
6459 mutex_exit(&socklist.sl_lock);
6460
6461 }
6462 return (retval);
6463 }
6464
6465 static void
socktpi_unix_destructor(void * buf,void * cdrarg)6466 socktpi_unix_destructor(void *buf, void *cdrarg)
6467 {
6468 struct sonode *so = (struct sonode *)buf;
6469 sotpi_info_t *sti = SOTOTPI(so);
6470
6471 mutex_enter(&socklist.sl_lock);
6472
6473 if (sti->sti_next_so != NULL)
6474 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6475 if (sti->sti_prev_so != NULL)
6476 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6477 else
6478 socklist.sl_list = sti->sti_next_so;
6479
6480 mutex_exit(&socklist.sl_lock);
6481
6482 socktpi_destructor(buf, cdrarg);
6483 }
6484
6485 int
socktpi_init(void)6486 socktpi_init(void)
6487 {
6488 /*
6489 * Create sonode caches. We create a special one for AF_UNIX so
6490 * that we can track them for netstat(1m).
6491 */
6492 socktpi_cache = kmem_cache_create("socktpi_cache",
6493 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6494 socktpi_destructor, NULL, NULL, NULL, 0);
6495
6496 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6497 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6498 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6499
6500 return (0);
6501 }
6502
6503 /*
6504 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6505 *
6506 * Caller must still update state and mode using sotpi_update_state().
6507 */
6508 int
sotpi_convert_sonode(struct sonode * so,struct sockparams * newsp,boolean_t * direct,queue_t ** qp,struct cred * cr)6509 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6510 boolean_t *direct, queue_t **qp, struct cred *cr)
6511 {
6512 sotpi_info_t *sti;
6513 struct sockparams *origsp = so->so_sockparams;
6514 sock_lower_handle_t handle = so->so_proto_handle;
6515 struct stdata *stp;
6516 struct vnode *vp;
6517 queue_t *q;
6518 int error = 0;
6519
6520 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6521 SS_FALLBACK_PENDING);
6522 ASSERT(SOCK_IS_NONSTR(so));
6523
6524 *qp = NULL;
6525 *direct = B_FALSE;
6526 so->so_sockparams = newsp;
6527 /*
6528 * Allocate and initalize fields required by TPI.
6529 */
6530 (void) sotpi_info_create(so, KM_SLEEP);
6531 sotpi_info_init(so);
6532
6533 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6534 sotpi_info_fini(so);
6535 sotpi_info_destroy(so);
6536 return (error);
6537 }
6538 ASSERT(handle == so->so_proto_handle);
6539 sti = SOTOTPI(so);
6540 if (sti->sti_direct != 0)
6541 *direct = B_TRUE;
6542
6543 /*
6544 * Keep the original sp around so we can properly dispose of the
6545 * sonode when the socket is being closed.
6546 */
6547 sti->sti_orig_sp = origsp;
6548
6549 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6550 so_alloc_addr(so, so->so_max_addr_len);
6551
6552 /*
6553 * If the application has done a SIOCSPGRP, make sure the
6554 * STREAM head is aware. This needs to take place before
6555 * the protocol start sending up messages. Otherwise we
6556 * might miss to generate SIGPOLL.
6557 *
6558 * It is possible that the application will receive duplicate
6559 * signals if some were already generated for either data or
6560 * connection indications.
6561 */
6562 if (so->so_pgrp != 0) {
6563 if (so_set_events(so, so->so_vnode, cr) != 0)
6564 so->so_pgrp = 0;
6565 }
6566
6567 /*
6568 * Determine which queue to use.
6569 */
6570 vp = SOTOV(so);
6571 stp = vp->v_stream;
6572 ASSERT(stp != NULL);
6573 q = stp->sd_wrq->q_next;
6574
6575 /*
6576 * Skip any modules that may have been auto pushed when the device
6577 * was opened
6578 */
6579 while (q->q_next != NULL)
6580 q = q->q_next;
6581 *qp = _RD(q);
6582
6583 /* This is now a STREAMS sockets */
6584 so->so_not_str = B_FALSE;
6585
6586 return (error);
6587 }
6588
6589 /*
6590 * Revert a TPI sonode. It is only allowed to revert the sonode during
6591 * the fallback process.
6592 */
6593 void
sotpi_revert_sonode(struct sonode * so,struct cred * cr)6594 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6595 {
6596 vnode_t *vp = SOTOV(so);
6597
6598 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6599 SS_FALLBACK_PENDING);
6600 ASSERT(!SOCK_IS_NONSTR(so));
6601 ASSERT(vp->v_stream != NULL);
6602
6603 strclean(vp);
6604 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6605
6606 /*
6607 * Restore the original sockparams. The caller is responsible for
6608 * dropping the ref to the new sp.
6609 */
6610 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6611
6612 sotpi_info_fini(so);
6613 sotpi_info_destroy(so);
6614
6615 /* This is no longer a STREAMS sockets */
6616 so->so_not_str = B_TRUE;
6617 }
6618
6619 void
sotpi_update_state(struct sonode * so,struct T_capability_ack * tcap,struct sockaddr * laddr,socklen_t laddrlen,struct sockaddr * faddr,socklen_t faddrlen,short opts)6620 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6621 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6622 socklen_t faddrlen, short opts)
6623 {
6624 sotpi_info_t *sti = SOTOTPI(so);
6625
6626 so_proc_tcapability_ack(so, tcap);
6627
6628 so->so_options |= opts;
6629
6630 /*
6631 * Determine whether the foreign and local address are valid
6632 */
6633 if (laddrlen != 0) {
6634 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6635 sti->sti_laddr_len = laddrlen;
6636 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6637 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6638 }
6639
6640 if (faddrlen != 0) {
6641 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6642 sti->sti_faddr_len = faddrlen;
6643 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6644 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6645 }
6646
6647 }
6648
6649 /*
6650 * Allocate enough space to cache the local and foreign addresses.
6651 */
6652 void
so_alloc_addr(struct sonode * so,t_uscalar_t maxlen)6653 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6654 {
6655 sotpi_info_t *sti = SOTOTPI(so);
6656
6657 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6658 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6659 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6660 P2ROUNDUP(maxlen, KMEM_ALIGN);
6661 so->so_max_addr_len = sti->sti_laddr_maxlen;
6662 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6663 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6664 + sti->sti_laddr_maxlen);
6665
6666 if (so->so_family == AF_UNIX) {
6667 /*
6668 * Initialize AF_UNIX related fields.
6669 */
6670 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6671 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6672 }
6673 }
6674
6675
6676 sotpi_info_t *
sotpi_sototpi(struct sonode * so)6677 sotpi_sototpi(struct sonode *so)
6678 {
6679 sotpi_info_t *sti;
6680
6681 ASSERT(so != NULL);
6682
6683 sti = (sotpi_info_t *)so->so_priv;
6684
6685 ASSERT(sti != NULL);
6686 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6687
6688 return (sti);
6689 }
6690
6691 static int
i_sotpi_info_constructor(sotpi_info_t * sti)6692 i_sotpi_info_constructor(sotpi_info_t *sti)
6693 {
6694 sti->sti_magic = SOTPI_INFO_MAGIC;
6695 sti->sti_ack_mp = NULL;
6696 sti->sti_discon_ind_mp = NULL;
6697 sti->sti_ux_bound_vp = NULL;
6698 sti->sti_unbind_mp = NULL;
6699
6700 sti->sti_conn_ind_head = NULL;
6701 sti->sti_conn_ind_tail = NULL;
6702
6703 sti->sti_laddr_sa = NULL;
6704 sti->sti_faddr_sa = NULL;
6705
6706 sti->sti_nl7c_flags = 0;
6707 sti->sti_nl7c_uri = NULL;
6708 sti->sti_nl7c_rcv_mp = NULL;
6709
6710 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6711 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6712
6713 return (0);
6714 }
6715
6716 static void
i_sotpi_info_destructor(sotpi_info_t * sti)6717 i_sotpi_info_destructor(sotpi_info_t *sti)
6718 {
6719 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6720 ASSERT(sti->sti_ack_mp == NULL);
6721 ASSERT(sti->sti_discon_ind_mp == NULL);
6722 ASSERT(sti->sti_ux_bound_vp == NULL);
6723 ASSERT(sti->sti_unbind_mp == NULL);
6724
6725 ASSERT(sti->sti_conn_ind_head == NULL);
6726 ASSERT(sti->sti_conn_ind_tail == NULL);
6727
6728 ASSERT(sti->sti_laddr_sa == NULL);
6729 ASSERT(sti->sti_faddr_sa == NULL);
6730
6731 ASSERT(sti->sti_nl7c_flags == 0);
6732 ASSERT(sti->sti_nl7c_uri == NULL);
6733 ASSERT(sti->sti_nl7c_rcv_mp == NULL);
6734
6735 mutex_destroy(&sti->sti_plumb_lock);
6736 cv_destroy(&sti->sti_ack_cv);
6737 }
6738
6739 /*
6740 * Creates and attaches TPI information to the given sonode
6741 */
6742 static boolean_t
sotpi_info_create(struct sonode * so,int kmflags)6743 sotpi_info_create(struct sonode *so, int kmflags)
6744 {
6745 sotpi_info_t *sti;
6746
6747 ASSERT(so->so_priv == NULL);
6748
6749 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6750 return (B_FALSE);
6751
6752 if (i_sotpi_info_constructor(sti) != 0) {
6753 kmem_free(sti, sizeof (*sti));
6754 return (B_FALSE);
6755 }
6756
6757 so->so_priv = (void *)sti;
6758 return (B_TRUE);
6759 }
6760
6761 /*
6762 * Initializes the TPI information.
6763 */
6764 static void
sotpi_info_init(struct sonode * so)6765 sotpi_info_init(struct sonode *so)
6766 {
6767 struct vnode *vp = SOTOV(so);
6768 sotpi_info_t *sti = SOTOTPI(so);
6769 time_t now;
6770
6771 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6772 vp->v_rdev = sti->sti_dev;
6773
6774 sti->sti_orig_sp = NULL;
6775
6776 sti->sti_pushcnt = 0;
6777
6778 now = gethrestime_sec();
6779 sti->sti_atime = now;
6780 sti->sti_mtime = now;
6781 sti->sti_ctime = now;
6782
6783 sti->sti_eaddr_mp = NULL;
6784 sti->sti_delayed_error = 0;
6785
6786 sti->sti_provinfo = NULL;
6787
6788 sti->sti_oobcnt = 0;
6789 sti->sti_oobsigcnt = 0;
6790
6791 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6792
6793 sti->sti_laddr_sa = 0;
6794 sti->sti_faddr_sa = 0;
6795 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6796 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6797
6798 sti->sti_laddr_valid = 0;
6799 sti->sti_faddr_valid = 0;
6800 sti->sti_faddr_noxlate = 0;
6801
6802 sti->sti_direct = 0;
6803
6804 ASSERT(sti->sti_ack_mp == NULL);
6805 ASSERT(sti->sti_ux_bound_vp == NULL);
6806 ASSERT(sti->sti_unbind_mp == NULL);
6807
6808 ASSERT(sti->sti_conn_ind_head == NULL);
6809 ASSERT(sti->sti_conn_ind_tail == NULL);
6810 }
6811
6812 /*
6813 * Given a sonode, grab the TPI info and free any data.
6814 */
6815 static void
sotpi_info_fini(struct sonode * so)6816 sotpi_info_fini(struct sonode *so)
6817 {
6818 sotpi_info_t *sti = SOTOTPI(so);
6819 mblk_t *mp;
6820
6821 ASSERT(sti->sti_discon_ind_mp == NULL);
6822
6823 if ((mp = sti->sti_conn_ind_head) != NULL) {
6824 mblk_t *mp1;
6825
6826 while (mp) {
6827 mp1 = mp->b_next;
6828 mp->b_next = NULL;
6829 freemsg(mp);
6830 mp = mp1;
6831 }
6832 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6833 }
6834
6835 /*
6836 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6837 * indirect them. It also uses so_count as a validity test.
6838 */
6839 mutex_enter(&so->so_lock);
6840
6841 if (sti->sti_laddr_sa) {
6842 ASSERT((caddr_t)sti->sti_faddr_sa ==
6843 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6844 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6845 sti->sti_laddr_valid = 0;
6846 sti->sti_faddr_valid = 0;
6847 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6848 sti->sti_laddr_sa = NULL;
6849 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6850 sti->sti_faddr_sa = NULL;
6851 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6852 }
6853
6854 mutex_exit(&so->so_lock);
6855
6856 if ((mp = sti->sti_eaddr_mp) != NULL) {
6857 freemsg(mp);
6858 sti->sti_eaddr_mp = NULL;
6859 sti->sti_delayed_error = 0;
6860 }
6861
6862 if ((mp = sti->sti_ack_mp) != NULL) {
6863 freemsg(mp);
6864 sti->sti_ack_mp = NULL;
6865 }
6866
6867 if ((mp = sti->sti_nl7c_rcv_mp) != NULL) {
6868 sti->sti_nl7c_rcv_mp = NULL;
6869 freemsg(mp);
6870 }
6871 sti->sti_nl7c_rcv_rval = 0;
6872 if (sti->sti_nl7c_uri != NULL) {
6873 nl7c_urifree(so);
6874 /* urifree() cleared nl7c_uri */
6875 }
6876 if (sti->sti_nl7c_flags) {
6877 sti->sti_nl7c_flags = 0;
6878 }
6879
6880 ASSERT(sti->sti_ux_bound_vp == NULL);
6881 if ((mp = sti->sti_unbind_mp) != NULL) {
6882 freemsg(mp);
6883 sti->sti_unbind_mp = NULL;
6884 }
6885 }
6886
6887 /*
6888 * Destroys the TPI information attached to a sonode.
6889 */
6890 static void
sotpi_info_destroy(struct sonode * so)6891 sotpi_info_destroy(struct sonode *so)
6892 {
6893 sotpi_info_t *sti = SOTOTPI(so);
6894
6895 i_sotpi_info_destructor(sti);
6896 kmem_free(sti, sizeof (*sti));
6897
6898 so->so_priv = NULL;
6899 }
6900
6901 /*
6902 * Create the global sotpi socket module entry. It will never be freed.
6903 */
6904 smod_info_t *
sotpi_smod_create(void)6905 sotpi_smod_create(void)
6906 {
6907 smod_info_t *smodp;
6908
6909 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6910 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6911 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6912 /*
6913 * Initialize the smod_refcnt to 1 so it will never be freed.
6914 */
6915 smodp->smod_refcnt = 1;
6916 smodp->smod_uc_version = SOCK_UC_VERSION;
6917 smodp->smod_dc_version = SOCK_DC_VERSION;
6918 smodp->smod_sock_create_func = &sotpi_create;
6919 smodp->smod_sock_destroy_func = &sotpi_destroy;
6920 return (smodp);
6921 }
6922