1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2015, Joyent, Inc.
25 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
26 * Copyright 2022 Garrett D'Amore
27 * Copyright 2024 Oxide Computer Company
28 */
29
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/buf.h>
35 #include <sys/conf.h>
36 #include <sys/cred.h>
37 #include <sys/kmem.h>
38 #include <sys/kmem_impl.h>
39 #include <sys/sysmacros.h>
40 #include <sys/vfs.h>
41 #include <sys/vnode.h>
42 #include <sys/debug.h>
43 #include <sys/errno.h>
44 #include <sys/time.h>
45 #include <sys/file.h>
46 #include <sys/open.h>
47 #include <sys/user.h>
48 #include <sys/termios.h>
49 #include <sys/stream.h>
50 #include <sys/strsubr.h>
51 #include <sys/strsun.h>
52 #include <sys/suntpi.h>
53 #include <sys/ddi.h>
54 #include <sys/esunddi.h>
55 #include <sys/flock.h>
56 #include <sys/modctl.h>
57 #include <sys/vtrace.h>
58 #include <sys/cmn_err.h>
59 #include <sys/pathname.h>
60
61 #include <sys/socket.h>
62 #include <sys/socketvar.h>
63 #include <sys/sockio.h>
64 #include <netinet/in.h>
65 #include <sys/un.h>
66 #include <sys/strsun.h>
67
68 #include <sys/tiuser.h>
69 #define _SUN_TPI_VERSION 2
70 #include <sys/tihdr.h>
71 #include <sys/timod.h> /* TI_GETMYNAME, TI_GETPEERNAME */
72
73 #include <c2/audit.h>
74
75 #include <inet/common.h>
76 #include <inet/ip.h>
77 #include <inet/ip6.h>
78 #include <inet/tcp.h>
79 #include <inet/udp_impl.h>
80
81 #include <sys/zone.h>
82
83 #include <fs/sockfs/sockcommon.h>
84 #include <fs/sockfs/socktpi.h>
85 #include <fs/sockfs/socktpi_impl.h>
86
87 /*
88 * Possible failures when memory can't be allocated. The documented behavior:
89 *
90 * 5.5: 4.X: XNET:
91 * accept: ENOMEM/ENOSR/EINTR - (EINTR) ENOMEM/ENOBUFS/ENOSR/
92 * EINTR
93 * (4.X does not document EINTR but returns it)
94 * bind: ENOSR - ENOBUFS/ENOSR
95 * connect: EINTR EINTR ENOBUFS/ENOSR/EINTR
96 * getpeername: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
97 * getsockname: ENOMEM/ENOSR ENOBUFS (-) ENOBUFS/ENOSR
98 * (4.X getpeername and getsockname do not fail in practice)
99 * getsockopt: ENOMEM/ENOSR - ENOBUFS/ENOSR
100 * listen: - - ENOBUFS
101 * recv: ENOMEM/ENOSR/EINTR EINTR ENOBUFS/ENOMEM/ENOSR/
102 * EINTR
103 * send: ENOMEM/ENOSR/EINTR ENOBUFS/EINTR ENOBUFS/ENOMEM/ENOSR/
104 * EINTR
105 * setsockopt: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
106 * shutdown: ENOMEM/ENOSR - ENOBUFS/ENOSR
107 * socket: ENOMEM/ENOSR ENOBUFS ENOBUFS/ENOMEM/ENOSR
108 * socketpair: ENOMEM/ENOSR - ENOBUFS/ENOMEM/ENOSR
109 *
110 * Resolution. When allocation fails:
111 * recv: return EINTR
112 * send: return EINTR
113 * connect, accept: EINTR
114 * bind, listen, shutdown (unbind, unix_close, disconnect): sleep
115 * socket, socketpair: ENOBUFS
116 * getpeername, getsockname: sleep
117 * getsockopt, setsockopt: sleep
118 */
119
120 #ifdef SOCK_TEST
121 /*
122 * Variables that make sockfs do something other than the standard TPI
123 * for the AF_INET transports.
124 *
125 * solisten_tpi_tcp:
126 * TCP can handle a O_T_BIND_REQ with an increased backlog even though
127 * the transport is already bound. This is needed to avoid losing the
128 * port number should listen() do a T_UNBIND_REQ followed by a
129 * O_T_BIND_REQ.
130 *
131 * soconnect_tpi_udp:
132 * UDP and ICMP can handle a T_CONN_REQ.
133 * This is needed to make the sequence of connect(), getsockname()
134 * return the local IP address used to send packets to the connected to
135 * destination.
136 *
137 * soconnect_tpi_tcp:
138 * TCP can handle a T_CONN_REQ without seeing a O_T_BIND_REQ.
139 * Set this to non-zero to send TPI conformant messages to TCP in this
140 * respect. This is a performance optimization.
141 *
142 * soaccept_tpi_tcp:
143 * TCP can handle a T_CONN_REQ without the acceptor being bound.
144 * This is a performance optimization that has been picked up in XTI.
145 *
146 * soaccept_tpi_multioptions:
147 * When inheriting SOL_SOCKET options from the listener to the accepting
148 * socket send them as a single message for AF_INET{,6}.
149 */
150 int solisten_tpi_tcp = 0;
151 int soconnect_tpi_udp = 0;
152 int soconnect_tpi_tcp = 0;
153 int soaccept_tpi_tcp = 0;
154 int soaccept_tpi_multioptions = 1;
155 #else /* SOCK_TEST */
156 #define soconnect_tpi_tcp 0
157 #define soconnect_tpi_udp 0
158 #define solisten_tpi_tcp 0
159 #define soaccept_tpi_tcp 0
160 #define soaccept_tpi_multioptions 1
161 #endif /* SOCK_TEST */
162
163 #ifdef SOCK_TEST
164 extern int do_useracc;
165 extern clock_t sock_test_timelimit;
166 #endif /* SOCK_TEST */
167
168 extern uint32_t ucredsize;
169
170 /*
171 * Some X/Open added checks might have to be backed out to keep SunOS 4.X
172 * applications working. Turn on this flag to disable these checks.
173 */
174 int xnet_skip_checks = 0;
175 int xnet_check_print = 0;
176 int xnet_truncate_print = 0;
177
178 static void sotpi_destroy(struct sonode *);
179 static struct sonode *sotpi_create(struct sockparams *, int, int, int, int,
180 int, int *, cred_t *cr);
181
182 static boolean_t sotpi_info_create(struct sonode *, int);
183 static void sotpi_info_init(struct sonode *);
184 static void sotpi_info_fini(struct sonode *);
185 static void sotpi_info_destroy(struct sonode *);
186
187 /*
188 * Do direct function call to the transport layer below; this would
189 * also allow the transport to utilize read-side synchronous stream
190 * interface if necessary. This is a /etc/system tunable that must
191 * not be modified on a running system. By default this is enabled
192 * for performance reasons and may be disabled for debugging purposes.
193 */
194 boolean_t socktpi_direct = B_TRUE;
195
196 static struct kmem_cache *socktpi_cache, *socktpi_unix_cache;
197
198 extern void sigintr(k_sigset_t *, int);
199 extern void sigunintr(k_sigset_t *);
200
201 static int sotpi_unbind(struct sonode *, int);
202
203 /* TPI sockfs sonode operations */
204 int sotpi_init(struct sonode *, struct sonode *, struct cred *,
205 int);
206 static int sotpi_accept(struct sonode *, int, struct cred *,
207 struct sonode **);
208 static int sotpi_bind(struct sonode *, struct sockaddr *, socklen_t,
209 int, struct cred *);
210 static int sotpi_listen(struct sonode *, int, struct cred *);
211 static int sotpi_connect(struct sonode *, struct sockaddr *,
212 socklen_t, int, int, struct cred *);
213 extern int sotpi_recvmsg(struct sonode *, struct nmsghdr *,
214 struct uio *, struct cred *);
215 static int sotpi_sendmsg(struct sonode *, struct nmsghdr *,
216 struct uio *, struct cred *);
217 static int sotpi_sendmblk(struct sonode *, struct nmsghdr *, int,
218 struct cred *, mblk_t **);
219 static int sosend_dgramcmsg(struct sonode *, struct sockaddr *, socklen_t,
220 struct uio *, void *, t_uscalar_t, int);
221 static int sodgram_direct(struct sonode *, struct sockaddr *,
222 socklen_t, struct uio *, int);
223 extern int sotpi_getpeername(struct sonode *, struct sockaddr *,
224 socklen_t *, boolean_t, struct cred *);
225 static int sotpi_getsockname(struct sonode *, struct sockaddr *,
226 socklen_t *, struct cred *);
227 static int sotpi_shutdown(struct sonode *, int, struct cred *);
228 extern int sotpi_getsockopt(struct sonode *, int, int, void *,
229 socklen_t *, int, struct cred *);
230 extern int sotpi_setsockopt(struct sonode *, int, int, const void *,
231 socklen_t, struct cred *);
232 static int sotpi_ioctl(struct sonode *, int, intptr_t, int, struct cred *,
233 int32_t *);
234 static int socktpi_plumbioctl(struct vnode *, int, intptr_t, int,
235 struct cred *, int32_t *);
236 static int sotpi_poll(struct sonode *, short, int, short *,
237 struct pollhead **);
238 static int sotpi_close(struct sonode *, int, struct cred *);
239
240 static int i_sotpi_info_constructor(sotpi_info_t *);
241 static void i_sotpi_info_destructor(sotpi_info_t *);
242
243 sonodeops_t sotpi_sonodeops = {
244 sotpi_init, /* sop_init */
245 sotpi_accept, /* sop_accept */
246 sotpi_bind, /* sop_bind */
247 sotpi_listen, /* sop_listen */
248 sotpi_connect, /* sop_connect */
249 sotpi_recvmsg, /* sop_recvmsg */
250 sotpi_sendmsg, /* sop_sendmsg */
251 sotpi_sendmblk, /* sop_sendmblk */
252 sotpi_getpeername, /* sop_getpeername */
253 sotpi_getsockname, /* sop_getsockname */
254 sotpi_shutdown, /* sop_shutdown */
255 sotpi_getsockopt, /* sop_getsockopt */
256 sotpi_setsockopt, /* sop_setsockopt */
257 sotpi_ioctl, /* sop_ioctl */
258 sotpi_poll, /* sop_poll */
259 sotpi_close, /* sop_close */
260 };
261
262 /*
263 * Return a TPI socket vnode.
264 *
265 * Note that sockets assume that the driver will clone (either itself
266 * or by using the clone driver) i.e. a socket() call will always
267 * result in a new vnode being created.
268 */
269
270 /*
271 * Common create code for socket and accept. If tso is set the values
272 * from that node are used instead of issuing a T_INFO_REQ.
273 */
274
275 /* ARGSUSED */
276 static struct sonode *
sotpi_create(struct sockparams * sp,int family,int type,int protocol,int version,int sflags,int * errorp,cred_t * cr)277 sotpi_create(struct sockparams *sp, int family, int type, int protocol,
278 int version, int sflags, int *errorp, cred_t *cr)
279 {
280 struct sonode *so;
281 kmem_cache_t *cp;
282
283 ASSERT(sp->sp_sdev_info.sd_vnode != NULL);
284
285 if (family == AF_NCA) {
286 /*
287 * The request is for an NCA socket so for NL7C use the
288 * INET domain instead and mark NL7C_AF_NCA below.
289 */
290 family = AF_INET;
291 /*
292 * NL7C is not supported in the non-global zone,
293 * we enforce this restriction here.
294 */
295 if (getzoneid() != GLOBAL_ZONEID) {
296 *errorp = ENOTSUP;
297 return (NULL);
298 }
299 }
300
301 /*
302 * to be compatible with old tpi socket implementation ignore
303 * sleep flag (sflags) passed in
304 */
305 cp = (family == AF_UNIX) ? socktpi_unix_cache : socktpi_cache;
306 so = kmem_cache_alloc(cp, KM_SLEEP);
307 if (so == NULL) {
308 *errorp = ENOMEM;
309 return (NULL);
310 }
311
312 sonode_init(so, sp, family, type, protocol, &sotpi_sonodeops);
313 sotpi_info_init(so);
314
315 if (version == SOV_DEFAULT)
316 version = so_default_version;
317
318 so->so_version = (short)version;
319 *errorp = 0;
320
321 return (so);
322 }
323
324 static void
sotpi_destroy(struct sonode * so)325 sotpi_destroy(struct sonode *so)
326 {
327 kmem_cache_t *cp;
328 struct sockparams *origsp;
329
330 /*
331 * If there is a new dealloc function (ie. smod_destroy_func),
332 * then it should check the correctness of the ops.
333 */
334
335 ASSERT(so->so_ops == &sotpi_sonodeops);
336
337 origsp = SOTOTPI(so)->sti_orig_sp;
338
339 sotpi_info_fini(so);
340
341 if (so->so_state & SS_FALLBACK_COMP) {
342 /*
343 * A fallback happend, which means that a sotpi_info_t struct
344 * was allocated (as opposed to being allocated from the TPI
345 * sonode cache. Therefore we explicitly free the struct
346 * here.
347 */
348 sotpi_info_destroy(so);
349 ASSERT(origsp != NULL);
350
351 origsp->sp_smod_info->smod_sock_destroy_func(so);
352 SOCKPARAMS_DEC_REF(origsp);
353 } else {
354 sonode_fini(so);
355 cp = (so->so_family == AF_UNIX) ? socktpi_unix_cache :
356 socktpi_cache;
357 kmem_cache_free(cp, so);
358 }
359 }
360
361 /* ARGSUSED1 */
362 int
sotpi_init(struct sonode * so,struct sonode * tso,struct cred * cr,int flags)363 sotpi_init(struct sonode *so, struct sonode *tso, struct cred *cr, int flags)
364 {
365 major_t maj;
366 dev_t newdev;
367 struct vnode *vp;
368 int error = 0;
369 struct stdata *stp;
370
371 sotpi_info_t *sti = SOTOTPI(so);
372
373 dprint(1, ("sotpi_init()\n"));
374
375 /*
376 * over write the sleep flag passed in but that is ok
377 * as tpi socket does not honor sleep flag.
378 */
379 flags |= FREAD|FWRITE;
380
381 /*
382 * Record in so_flag that it is a clone.
383 */
384 if (getmajor(sti->sti_dev) == clone_major)
385 so->so_flag |= SOCLONE;
386
387 if ((so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM) &&
388 (so->so_family == AF_INET || so->so_family == AF_INET6) &&
389 (so->so_protocol == IPPROTO_TCP || so->so_protocol == IPPROTO_UDP ||
390 so->so_protocol == IPPROTO_IP)) {
391 /* Tell tcp or udp that it's talking to sockets */
392 flags |= SO_SOCKSTR;
393
394 /*
395 * Here we indicate to socktpi_open() our attempt to
396 * make direct calls between sockfs and transport.
397 * The final decision is left to socktpi_open().
398 */
399 sti->sti_direct = 1;
400
401 ASSERT(so->so_type != SOCK_DGRAM || tso == NULL);
402 if (so->so_type == SOCK_STREAM && tso != NULL) {
403 if (SOTOTPI(tso)->sti_direct) {
404 /*
405 * Inherit sti_direct from listener and pass
406 * SO_ACCEPTOR open flag to tcp, indicating
407 * that this is an accept fast-path instance.
408 */
409 flags |= SO_ACCEPTOR;
410 } else {
411 /*
412 * sti_direct is not set on listener, meaning
413 * that the listener has been converted from
414 * a socket to a stream. Ensure that the
415 * acceptor inherits these settings.
416 */
417 sti->sti_direct = 0;
418 flags &= ~SO_SOCKSTR;
419 }
420 }
421 }
422
423 /*
424 * Tell local transport that it is talking to sockets.
425 */
426 if (so->so_family == AF_UNIX) {
427 flags |= SO_SOCKSTR;
428 }
429
430 vp = SOTOV(so);
431 newdev = vp->v_rdev;
432 maj = getmajor(newdev);
433 ASSERT(STREAMSTAB(maj));
434
435 error = stropen(vp, &newdev, flags, cr);
436
437 stp = vp->v_stream;
438 if (error == 0) {
439 if (so->so_flag & SOCLONE)
440 ASSERT(newdev != vp->v_rdev);
441 mutex_enter(&so->so_lock);
442 sti->sti_dev = newdev;
443 vp->v_rdev = newdev;
444 mutex_exit(&so->so_lock);
445
446 if (stp->sd_flag & STRISTTY) {
447 /*
448 * this is a post SVR4 tty driver - a socket can not
449 * be a controlling terminal. Fail the open.
450 */
451 (void) sotpi_close(so, flags, cr);
452 return (ENOTTY); /* XXX */
453 }
454
455 ASSERT(stp->sd_wrq != NULL);
456 sti->sti_provinfo = tpi_findprov(stp->sd_wrq);
457
458 /*
459 * If caller is interested in doing direct function call
460 * interface to/from transport module, probe the module
461 * directly beneath the streamhead to see if it qualifies.
462 *
463 * We turn off the direct interface when qualifications fail.
464 * In the acceptor case, we simply turn off the sti_direct
465 * flag on the socket. We do the fallback after the accept
466 * has completed, before the new socket is returned to the
467 * application.
468 */
469 if (sti->sti_direct) {
470 queue_t *tq = stp->sd_wrq->q_next;
471
472 /*
473 * sti_direct is currently supported and tested
474 * only for tcp/udp; this is the main reason to
475 * have the following assertions.
476 */
477 ASSERT(so->so_family == AF_INET ||
478 so->so_family == AF_INET6);
479 ASSERT(so->so_protocol == IPPROTO_UDP ||
480 so->so_protocol == IPPROTO_TCP ||
481 so->so_protocol == IPPROTO_IP);
482 ASSERT(so->so_type == SOCK_DGRAM ||
483 so->so_type == SOCK_STREAM);
484
485 /*
486 * Abort direct call interface if the module directly
487 * underneath the stream head is not defined with the
488 * _D_DIRECT flag. This could happen in the tcp or
489 * udp case, when some other module is autopushed
490 * above it, or for some reasons the expected module
491 * isn't purely D_MP (which is the main requirement).
492 */
493 if (!socktpi_direct || !(tq->q_flag & _QDIRECT) ||
494 !(_OTHERQ(tq)->q_flag & _QDIRECT)) {
495 int rval;
496
497 /* Continue on without direct calls */
498 sti->sti_direct = 0;
499
500 /*
501 * Cannot issue ioctl on fallback socket since
502 * there is no conn associated with the queue.
503 * The fallback downcall will notify the proto
504 * of the change.
505 */
506 if (!(flags & SO_ACCEPTOR) &&
507 !(flags & SO_FALLBACK)) {
508 if ((error = strioctl(vp,
509 _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
510 cr, &rval)) != 0) {
511 (void) sotpi_close(so, flags,
512 cr);
513 return (error);
514 }
515 }
516 }
517 }
518
519 if (flags & SO_FALLBACK) {
520 /*
521 * The stream created does not have a conn.
522 * do stream set up after conn has been assigned
523 */
524 return (error);
525 }
526 error = so_strinit(so, tso);
527 if (error != 0) {
528 (void) sotpi_close(so, flags, cr);
529 return (error);
530 }
531
532 /* Enable sendfile() on AF_UNIX streams */
533 if (so->so_family == AF_UNIX && so->so_type == SOCK_STREAM) {
534 mutex_enter(&so->so_lock);
535 so->so_mode |= SM_SENDFILESUPP;
536 mutex_exit(&so->so_lock);
537 }
538
539 /* Wildcard */
540 if (so->so_protocol != so->so_sockparams->sp_protocol) {
541 int protocol = so->so_protocol;
542 /*
543 * Issue SO_PROTOTYPE setsockopt.
544 */
545 error = sotpi_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
546 &protocol, (t_uscalar_t)sizeof (protocol), cr);
547 if (error != 0) {
548 (void) sotpi_close(so, flags, cr);
549 /*
550 * Setsockopt often fails with ENOPROTOOPT but
551 * socket() should fail with
552 * EPROTONOSUPPORT/EPROTOTYPE.
553 */
554 return (EPROTONOSUPPORT);
555 }
556 }
557
558 } else {
559 /*
560 * While the same socket can not be reopened (unlike specfs)
561 * the stream head sets STREOPENFAIL when the autopush fails.
562 */
563 if ((stp != NULL) &&
564 (stp->sd_flag & STREOPENFAIL)) {
565 /*
566 * Open failed part way through.
567 */
568 mutex_enter(&stp->sd_lock);
569 stp->sd_flag &= ~STREOPENFAIL;
570 mutex_exit(&stp->sd_lock);
571 (void) sotpi_close(so, flags, cr);
572 return (error);
573 /*NOTREACHED*/
574 }
575 ASSERT(stp == NULL);
576 }
577 TRACE_4(TR_FAC_SOCKFS, TR_SOCKFS_OPEN,
578 "sockfs open:maj %d vp %p so %p error %d",
579 maj, vp, so, error);
580 return (error);
581 }
582
583 /*
584 * Bind the socket to an unspecified address in sockfs only.
585 * Used for TCP/UDP transports where we know that the O_T_BIND_REQ isn't
586 * required in all cases.
587 */
588 static void
so_automatic_bind(struct sonode * so)589 so_automatic_bind(struct sonode *so)
590 {
591 sotpi_info_t *sti = SOTOTPI(so);
592 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
593
594 ASSERT(MUTEX_HELD(&so->so_lock));
595 ASSERT(!(so->so_state & SS_ISBOUND));
596 ASSERT(sti->sti_unbind_mp);
597
598 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
599 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
600 sti->sti_laddr_sa->sa_family = so->so_family;
601 so->so_state |= SS_ISBOUND;
602 }
603
604
605 /*
606 * bind the socket.
607 *
608 * If the socket is already bound and none of _SOBIND_SOCKBSD or _SOBIND_XPG4_2
609 * are passed in we allow rebinding. Note that for backwards compatibility
610 * even "svr4" sockets pass in _SOBIND_SOCKBSD/SOV_SOCKBSD to sobind/bind.
611 * Thus the rebinding code is currently not executed.
612 *
613 * The constraints for rebinding are:
614 * - it is a SOCK_DGRAM, or
615 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
616 * and no listen() has been done.
617 * This rebinding code was added based on some language in the XNET book
618 * about not returning EINVAL if the protocol allows rebinding. However,
619 * this language is not present in the Posix socket draft. Thus maybe the
620 * rebinding logic should be deleted from the source.
621 *
622 * A null "name" can be used to unbind the socket if:
623 * - it is a SOCK_DGRAM, or
624 * - it is a SOCK_STREAM/SOCK_SEQPACKET that has not been connected
625 * and no listen() has been done.
626 */
627 /* ARGSUSED */
628 static int
sotpi_bindlisten(struct sonode * so,struct sockaddr * name,socklen_t namelen,int backlog,int flags,struct cred * cr)629 sotpi_bindlisten(struct sonode *so, struct sockaddr *name,
630 socklen_t namelen, int backlog, int flags, struct cred *cr)
631 {
632 struct T_bind_req bind_req;
633 struct T_bind_ack *bind_ack;
634 int error = 0;
635 mblk_t *mp;
636 void *addr;
637 t_uscalar_t addrlen;
638 int unbind_on_err = 1;
639 boolean_t clear_acceptconn_on_err = B_FALSE;
640 boolean_t restore_backlog_on_err = B_FALSE;
641 int save_so_backlog = 0;
642 t_scalar_t PRIM_type = O_T_BIND_REQ;
643 boolean_t tcp_udp_xport;
644 sotpi_info_t *sti = SOTOTPI(so);
645
646 dprintso(so, 1, ("sotpi_bindlisten(%p, %p, %d, %d, 0x%x) %s\n",
647 (void *)so, (void *)name, namelen, backlog, flags,
648 pr_state(so->so_state, so->so_mode)));
649
650 tcp_udp_xport = so->so_type == SOCK_STREAM || so->so_type == SOCK_DGRAM;
651
652 if (!(flags & _SOBIND_LOCK_HELD)) {
653 mutex_enter(&so->so_lock);
654 so_lock_single(so); /* Set SOLOCKED */
655 } else {
656 ASSERT(MUTEX_HELD(&so->so_lock));
657 ASSERT(so->so_flag & SOLOCKED);
658 }
659
660 /*
661 * Make sure that there is a preallocated unbind_req message
662 * before binding. This message allocated when the socket is
663 * created but it might be have been consumed.
664 */
665 if (sti->sti_unbind_mp == NULL) {
666 dprintso(so, 1, ("sobind: allocating unbind_req\n"));
667 /* NOTE: holding so_lock while sleeping */
668 sti->sti_unbind_mp =
669 soallocproto(sizeof (struct T_unbind_req), _ALLOC_SLEEP,
670 cr);
671 }
672
673 if (flags & _SOBIND_REBIND) {
674 /*
675 * Called from solisten after doing an sotpi_unbind() or
676 * potentially without the unbind (latter for AF_INET{,6}).
677 */
678 ASSERT(name == NULL && namelen == 0);
679
680 if (so->so_family == AF_UNIX) {
681 ASSERT(sti->sti_ux_bound_vp);
682 addr = &sti->sti_ux_laddr;
683 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
684 dprintso(so, 1, ("sobind rebind UNIX: addrlen %d, "
685 "addr 0x%p, vp %p\n",
686 addrlen,
687 (void *)((struct so_ux_addr *)addr)->soua_vp,
688 (void *)sti->sti_ux_bound_vp));
689 } else {
690 addr = sti->sti_laddr_sa;
691 addrlen = (t_uscalar_t)sti->sti_laddr_len;
692 }
693 } else if (flags & _SOBIND_UNSPEC) {
694 ASSERT(name == NULL && namelen == 0);
695
696 /*
697 * The caller checked SS_ISBOUND but not necessarily
698 * under so_lock
699 */
700 if (so->so_state & SS_ISBOUND) {
701 /* No error */
702 goto done;
703 }
704
705 /* Set an initial local address */
706 switch (so->so_family) {
707 case AF_UNIX:
708 /*
709 * Use an address with same size as struct sockaddr
710 * just like BSD.
711 */
712 sti->sti_laddr_len =
713 (socklen_t)sizeof (struct sockaddr);
714 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
715 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
716 sti->sti_laddr_sa->sa_family = so->so_family;
717
718 /*
719 * Pass down an address with the implicit bind
720 * magic number and the rest all zeros.
721 * The transport will return a unique address.
722 */
723 sti->sti_ux_laddr.soua_vp = NULL;
724 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_IMPLICIT;
725 addr = &sti->sti_ux_laddr;
726 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
727 break;
728
729 case AF_INET:
730 case AF_INET6:
731 /*
732 * An unspecified bind in TPI has a NULL address.
733 * Set the address in sockfs to have the sa_family.
734 */
735 sti->sti_laddr_len = (so->so_family == AF_INET) ?
736 (socklen_t)sizeof (sin_t) :
737 (socklen_t)sizeof (sin6_t);
738 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
739 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
740 sti->sti_laddr_sa->sa_family = so->so_family;
741 addr = NULL;
742 addrlen = 0;
743 break;
744
745 default:
746 /*
747 * An unspecified bind in TPI has a NULL address.
748 * Set the address in sockfs to be zero length.
749 *
750 * Can not assume there is a sa_family for all
751 * protocol families. For example, AF_X25 does not
752 * have a family field.
753 */
754 bzero(sti->sti_laddr_sa, sti->sti_laddr_len);
755 sti->sti_laddr_len = 0; /* XXX correct? */
756 addr = NULL;
757 addrlen = 0;
758 break;
759 }
760
761 } else {
762 if (so->so_state & SS_ISBOUND) {
763 /*
764 * If it is ok to rebind the socket, first unbind
765 * with the transport. A rebind to the NULL address
766 * is interpreted as an unbind.
767 * Note that a bind to NULL in BSD does unbind the
768 * socket but it fails with EINVAL.
769 * Note that regular sockets set SOV_SOCKBSD i.e.
770 * _SOBIND_SOCKBSD gets set here hence no type of
771 * socket does currently allow rebinding.
772 *
773 * If the name is NULL just do an unbind.
774 */
775 if (flags & (_SOBIND_SOCKBSD|_SOBIND_XPG4_2) &&
776 name != NULL) {
777 error = EINVAL;
778 unbind_on_err = 0;
779 eprintsoline(so, error);
780 goto done;
781 }
782 if ((so->so_mode & SM_CONNREQUIRED) &&
783 (so->so_state & SS_CANTREBIND)) {
784 error = EINVAL;
785 unbind_on_err = 0;
786 eprintsoline(so, error);
787 goto done;
788 }
789 error = sotpi_unbind(so, 0);
790 if (error) {
791 eprintsoline(so, error);
792 goto done;
793 }
794 ASSERT(!(so->so_state & SS_ISBOUND));
795 if (name == NULL) {
796 so->so_state &=
797 ~(SS_ISCONNECTED|SS_ISCONNECTING);
798 goto done;
799 }
800 }
801
802 /* X/Open requires this check */
803 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
804 if (xnet_check_print) {
805 printf("sockfs: X/Open bind state check "
806 "caused EINVAL\n");
807 }
808 error = EINVAL;
809 goto done;
810 }
811
812 switch (so->so_family) {
813 case AF_UNIX:
814 /*
815 * All AF_UNIX addresses are nul terminated
816 * when copied (copyin_name) in so the minimum
817 * length is 3 bytes.
818 */
819 if (name == NULL ||
820 (ssize_t)namelen <= sizeof (short) + 1) {
821 error = EISDIR;
822 eprintsoline(so, error);
823 goto done;
824 }
825 /*
826 * Verify so_family matches the bound family.
827 * BSD does not check this for AF_UNIX resulting
828 * in funny mknods.
829 */
830 if (name->sa_family != so->so_family) {
831 error = EAFNOSUPPORT;
832 goto done;
833 }
834 break;
835 case AF_INET:
836 if (name == NULL) {
837 error = EINVAL;
838 eprintsoline(so, error);
839 goto done;
840 }
841 if ((size_t)namelen != sizeof (sin_t)) {
842 error = name->sa_family != so->so_family ?
843 EAFNOSUPPORT : EINVAL;
844 eprintsoline(so, error);
845 goto done;
846 }
847 if ((flags & _SOBIND_XPG4_2) &&
848 (name->sa_family != so->so_family)) {
849 /*
850 * This check has to be made for X/Open
851 * sockets however application failures have
852 * been observed when it is applied to
853 * all sockets.
854 */
855 error = EAFNOSUPPORT;
856 eprintsoline(so, error);
857 goto done;
858 }
859 /*
860 * Force a zero sa_family to match so_family.
861 *
862 * Some programs like inetd(8) don't set the
863 * family field. Other programs leave
864 * sin_family set to garbage - SunOS 4.X does
865 * not check the family field on a bind.
866 * We use the family field that
867 * was passed in to the socket() call.
868 */
869 name->sa_family = so->so_family;
870 break;
871
872 case AF_INET6: {
873 #ifdef DEBUG
874 sin6_t *sin6 = (sin6_t *)name;
875 #endif /* DEBUG */
876
877 if (name == NULL) {
878 error = EINVAL;
879 eprintsoline(so, error);
880 goto done;
881 }
882 if ((size_t)namelen != sizeof (sin6_t)) {
883 error = name->sa_family != so->so_family ?
884 EAFNOSUPPORT : EINVAL;
885 eprintsoline(so, error);
886 goto done;
887 }
888 if (name->sa_family != so->so_family) {
889 /*
890 * With IPv6 we require the family to match
891 * unlike in IPv4.
892 */
893 error = EAFNOSUPPORT;
894 eprintsoline(so, error);
895 goto done;
896 }
897 #ifdef DEBUG
898 /*
899 * Verify that apps don't forget to clear
900 * sin6_scope_id etc
901 */
902 if (sin6->sin6_scope_id != 0 &&
903 !IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) {
904 zcmn_err(getzoneid(), CE_WARN,
905 "bind with uninitialized sin6_scope_id "
906 "(%d) on socket. Pid = %d\n",
907 (int)sin6->sin6_scope_id,
908 (int)curproc->p_pid);
909 }
910 if (sin6->__sin6_src_id != 0) {
911 zcmn_err(getzoneid(), CE_WARN,
912 "bind with uninitialized __sin6_src_id "
913 "(%d) on socket. Pid = %d\n",
914 (int)sin6->__sin6_src_id,
915 (int)curproc->p_pid);
916 }
917 #endif /* DEBUG */
918 break;
919 }
920 default:
921 /*
922 * Don't do any length or sa_family check to allow
923 * non-sockaddr style addresses.
924 */
925 if (name == NULL) {
926 error = EINVAL;
927 eprintsoline(so, error);
928 goto done;
929 }
930 break;
931 }
932
933 if (namelen > (t_uscalar_t)sti->sti_laddr_maxlen) {
934 error = ENAMETOOLONG;
935 eprintsoline(so, error);
936 goto done;
937 }
938 /*
939 * Save local address.
940 */
941 sti->sti_laddr_len = (socklen_t)namelen;
942 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
943 bcopy(name, sti->sti_laddr_sa, namelen);
944
945 addr = sti->sti_laddr_sa;
946 addrlen = (t_uscalar_t)sti->sti_laddr_len;
947 switch (so->so_family) {
948 case AF_INET6:
949 case AF_INET:
950 break;
951 case AF_UNIX: {
952 struct sockaddr_un *soun =
953 (struct sockaddr_un *)sti->sti_laddr_sa;
954 struct vnode *vp, *rvp;
955 struct vattr vattr;
956
957 ASSERT(sti->sti_ux_bound_vp == NULL);
958 /*
959 * Create vnode for the specified path name.
960 * Keep vnode held with a reference in sti_ux_bound_vp.
961 * Use the vnode pointer as the address used in the
962 * bind with the transport.
963 *
964 * Use the same mode as in BSD. In particular this does
965 * not observe the umask.
966 */
967 /* MAXPATHLEN + soun_family + nul termination */
968 if (sti->sti_laddr_len >
969 (socklen_t)(MAXPATHLEN + sizeof (short) + 1)) {
970 error = ENAMETOOLONG;
971 eprintsoline(so, error);
972 goto done;
973 }
974 vattr.va_type = VSOCK;
975 vattr.va_mode = 0777 & ~PTOU(curproc)->u_cmask;
976 vattr.va_mask = AT_TYPE|AT_MODE;
977 /* NOTE: holding so_lock */
978 error = vn_create(soun->sun_path, UIO_SYSSPACE, &vattr,
979 EXCL, 0, &vp, CRMKNOD, 0, 0);
980 if (error) {
981 if (error == EEXIST)
982 error = EADDRINUSE;
983 eprintsoline(so, error);
984 goto done;
985 }
986 /*
987 * Establish pointer from the underlying filesystem
988 * vnode to the socket node.
989 * sti_ux_bound_vp and v_stream->sd_vnode form the
990 * cross-linkage between the underlying filesystem
991 * node and the socket node.
992 */
993
994 if ((VOP_REALVP(vp, &rvp, NULL) == 0) && (vp != rvp)) {
995 VN_HOLD(rvp);
996 VN_RELE(vp);
997 vp = rvp;
998 }
999
1000 ASSERT(SOTOV(so)->v_stream);
1001 mutex_enter(&vp->v_lock);
1002 vp->v_stream = SOTOV(so)->v_stream;
1003 sti->sti_ux_bound_vp = vp;
1004 mutex_exit(&vp->v_lock);
1005
1006 /*
1007 * Use the vnode pointer value as a unique address
1008 * (together with the magic number to avoid conflicts
1009 * with implicit binds) in the transport provider.
1010 */
1011 sti->sti_ux_laddr.soua_vp =
1012 (void *)sti->sti_ux_bound_vp;
1013 sti->sti_ux_laddr.soua_magic = SOU_MAGIC_EXPLICIT;
1014 addr = &sti->sti_ux_laddr;
1015 addrlen = (t_uscalar_t)sizeof (sti->sti_ux_laddr);
1016 dprintso(so, 1, ("sobind UNIX: addrlen %d, addr %p\n",
1017 addrlen,
1018 (void *)((struct so_ux_addr *)addr)->soua_vp));
1019 break;
1020 }
1021 } /* end switch (so->so_family) */
1022 }
1023
1024 /*
1025 * set SS_ACCEPTCONN before sending down O_T_BIND_REQ since
1026 * the transport can start passing up T_CONN_IND messages
1027 * as soon as it receives the bind req and strsock_proto()
1028 * insists that SS_ACCEPTCONN is set when processing T_CONN_INDs.
1029 */
1030 if (flags & _SOBIND_LISTEN) {
1031 if ((so->so_state & SS_ACCEPTCONN) == 0)
1032 clear_acceptconn_on_err = B_TRUE;
1033 save_so_backlog = so->so_backlog;
1034 restore_backlog_on_err = B_TRUE;
1035 so->so_state |= SS_ACCEPTCONN;
1036 so->so_backlog = backlog;
1037 }
1038
1039 /*
1040 * We send a T_BIND_REQ for TCP/UDP since we know it supports it,
1041 * for other transports we will send in a O_T_BIND_REQ.
1042 */
1043 if (tcp_udp_xport &&
1044 (so->so_family == AF_INET || so->so_family == AF_INET6))
1045 PRIM_type = T_BIND_REQ;
1046
1047 bind_req.PRIM_type = PRIM_type;
1048 bind_req.ADDR_length = addrlen;
1049 bind_req.ADDR_offset = (t_scalar_t)sizeof (bind_req);
1050 bind_req.CONIND_number = backlog;
1051 /* NOTE: holding so_lock while sleeping */
1052 mp = soallocproto2(&bind_req, sizeof (bind_req),
1053 addr, addrlen, 0, _ALLOC_SLEEP, cr);
1054 sti->sti_laddr_valid = 0;
1055
1056 /* Done using sti_laddr_sa - can drop the lock */
1057 mutex_exit(&so->so_lock);
1058
1059 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1060 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1061 if (error) {
1062 eprintsoline(so, error);
1063 mutex_enter(&so->so_lock);
1064 goto done;
1065 }
1066
1067 mutex_enter(&so->so_lock);
1068 error = sowaitprim(so, PRIM_type, T_BIND_ACK,
1069 (t_uscalar_t)sizeof (*bind_ack), &mp, 0);
1070 if (error) {
1071 eprintsoline(so, error);
1072 goto done;
1073 }
1074 ASSERT(mp);
1075 /*
1076 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1077 * strsock_proto while the lock was dropped above, the bind
1078 * is allowed to complete.
1079 */
1080
1081 /* Mark as bound. This will be undone if we detect errors below. */
1082 if (flags & _SOBIND_NOXLATE) {
1083 ASSERT(so->so_family == AF_UNIX);
1084 sti->sti_faddr_noxlate = 1;
1085 }
1086 ASSERT(!(so->so_state & SS_ISBOUND) || (flags & _SOBIND_REBIND));
1087 so->so_state |= SS_ISBOUND;
1088 ASSERT(sti->sti_unbind_mp);
1089
1090 /* note that we've already set SS_ACCEPTCONN above */
1091
1092 /*
1093 * Recompute addrlen - an unspecied bind sent down an
1094 * address of length zero but we expect the appropriate length
1095 * in return.
1096 */
1097 addrlen = (t_uscalar_t)(so->so_family == AF_UNIX ?
1098 sizeof (sti->sti_ux_laddr) : sti->sti_laddr_len);
1099
1100 bind_ack = (struct T_bind_ack *)mp->b_rptr;
1101 /*
1102 * The alignment restriction is really too strict but
1103 * we want enough alignment to inspect the fields of
1104 * a sockaddr_in.
1105 */
1106 addr = sogetoff(mp, bind_ack->ADDR_offset,
1107 bind_ack->ADDR_length,
1108 __TPI_ALIGN_SIZE);
1109 if (addr == NULL) {
1110 freemsg(mp);
1111 error = EPROTO;
1112 eprintsoline(so, error);
1113 goto done;
1114 }
1115 if (!(flags & _SOBIND_UNSPEC)) {
1116 /*
1117 * Verify that the transport didn't return something we
1118 * did not want e.g. an address other than what we asked for.
1119 *
1120 * NOTE: These checks would go away if/when we switch to
1121 * using the new TPI (in which the transport would fail
1122 * the request instead of assigning a different address).
1123 *
1124 * NOTE2: For protocols that we don't know (i.e. any
1125 * other than AF_INET6, AF_INET and AF_UNIX), we
1126 * cannot know if the transport should be expected to
1127 * return the same address as that requested.
1128 *
1129 * NOTE3: For AF_INET and AF_INET6, TCP/UDP, we send
1130 * down a T_BIND_REQ. We use O_T_BIND_REQ for others.
1131 *
1132 * For example, in the case of netatalk it may be
1133 * inappropriate for the transport to return the
1134 * requested address (as it may have allocated a local
1135 * port number in behaviour similar to that of an
1136 * AF_INET bind request with a port number of zero).
1137 *
1138 * Given the definition of O_T_BIND_REQ, where the
1139 * transport may bind to an address other than the
1140 * requested address, it's not possible to determine
1141 * whether a returned address that differs from the
1142 * requested address is a reason to fail (because the
1143 * requested address was not available) or succeed
1144 * (because the transport allocated an appropriate
1145 * address and/or port).
1146 *
1147 * sockfs currently requires that the transport return
1148 * the requested address in the T_BIND_ACK, unless
1149 * there is code here to allow for any discrepancy.
1150 * Such code exists for AF_INET and AF_INET6.
1151 *
1152 * Netatalk chooses to return the requested address
1153 * rather than the (correct) allocated address. This
1154 * means that netatalk violates the TPI specification
1155 * (and would not function correctly if used from a
1156 * TLI application), but it does mean that it works
1157 * with sockfs.
1158 *
1159 * As noted above, using the newer XTI bind primitive
1160 * (T_BIND_REQ) in preference to O_T_BIND_REQ would
1161 * allow sockfs to be more sure about whether or not
1162 * the bind request had succeeded (as transports are
1163 * not permitted to bind to a different address than
1164 * that requested - they must return failure).
1165 * Unfortunately, support for T_BIND_REQ may not be
1166 * present in all transport implementations (netatalk,
1167 * for example, doesn't have it), making the
1168 * transition difficult.
1169 */
1170 if (bind_ack->ADDR_length != addrlen) {
1171 /* Assumes that the requested address was in use */
1172 freemsg(mp);
1173 error = EADDRINUSE;
1174 eprintsoline(so, error);
1175 goto done;
1176 }
1177
1178 switch (so->so_family) {
1179 case AF_INET6:
1180 case AF_INET: {
1181 sin_t *rname, *aname;
1182
1183 rname = (sin_t *)addr;
1184 aname = (sin_t *)sti->sti_laddr_sa;
1185
1186 /*
1187 * Take advantage of the alignment
1188 * of sin_port and sin6_port which fall
1189 * in the same place in their data structures.
1190 * Just use sin_port for either address family.
1191 *
1192 * This may become a problem if (heaven forbid)
1193 * there's a separate ipv6port_reserved... :-P
1194 *
1195 * Binding to port 0 has the semantics of letting
1196 * the transport bind to any port.
1197 *
1198 * If the transport is TCP or UDP since we had sent
1199 * a T_BIND_REQ we would not get a port other than
1200 * what we asked for.
1201 */
1202 if (tcp_udp_xport) {
1203 /*
1204 * Pick up the new port number if we bound to
1205 * port 0.
1206 */
1207 if (aname->sin_port == 0)
1208 aname->sin_port = rname->sin_port;
1209 sti->sti_laddr_valid = 1;
1210 break;
1211 }
1212 if (aname->sin_port != 0 &&
1213 aname->sin_port != rname->sin_port) {
1214 freemsg(mp);
1215 error = EADDRINUSE;
1216 eprintsoline(so, error);
1217 goto done;
1218 }
1219 /*
1220 * Pick up the new port number if we bound to port 0.
1221 */
1222 aname->sin_port = rname->sin_port;
1223
1224 /*
1225 * Unfortunately, addresses aren't _quite_ the same.
1226 */
1227 if (so->so_family == AF_INET) {
1228 if (aname->sin_addr.s_addr !=
1229 rname->sin_addr.s_addr) {
1230 freemsg(mp);
1231 error = EADDRNOTAVAIL;
1232 eprintsoline(so, error);
1233 goto done;
1234 }
1235 } else {
1236 sin6_t *rname6 = (sin6_t *)rname;
1237 sin6_t *aname6 = (sin6_t *)aname;
1238
1239 if (!IN6_ARE_ADDR_EQUAL(&aname6->sin6_addr,
1240 &rname6->sin6_addr)) {
1241 freemsg(mp);
1242 error = EADDRNOTAVAIL;
1243 eprintsoline(so, error);
1244 goto done;
1245 }
1246 }
1247 break;
1248 }
1249 case AF_UNIX:
1250 if (bcmp(addr, &sti->sti_ux_laddr, addrlen) != 0) {
1251 freemsg(mp);
1252 error = EADDRINUSE;
1253 eprintsoline(so, error);
1254 eprintso(so,
1255 ("addrlen %d, addr 0x%x, vp %p\n",
1256 addrlen, *((int *)addr),
1257 (void *)sti->sti_ux_bound_vp));
1258 goto done;
1259 }
1260 sti->sti_laddr_valid = 1;
1261 break;
1262 default:
1263 /*
1264 * NOTE: This assumes that addresses can be
1265 * byte-compared for equivalence.
1266 */
1267 if (bcmp(addr, sti->sti_laddr_sa, addrlen) != 0) {
1268 freemsg(mp);
1269 error = EADDRINUSE;
1270 eprintsoline(so, error);
1271 goto done;
1272 }
1273 /*
1274 * Don't mark sti_laddr_valid, as we cannot be
1275 * sure that the returned address is the real
1276 * bound address when talking to an unknown
1277 * transport.
1278 */
1279 break;
1280 }
1281 } else {
1282 /*
1283 * Save for returned address for getsockname.
1284 * Needed for unspecific bind unless transport supports
1285 * the TI_GETMYNAME ioctl.
1286 * Do this for AF_INET{,6} even though they do, as
1287 * caching info here is much better performance than
1288 * a TPI/STREAMS trip to the transport for getsockname.
1289 * Any which can't for some reason _must_ _not_ set
1290 * sti_laddr_valid here for the caching version of
1291 * getsockname to not break;
1292 */
1293 switch (so->so_family) {
1294 case AF_UNIX:
1295 /*
1296 * Record the address bound with the transport
1297 * for use by socketpair.
1298 */
1299 bcopy(addr, &sti->sti_ux_laddr, addrlen);
1300 sti->sti_laddr_valid = 1;
1301 break;
1302 case AF_INET:
1303 case AF_INET6:
1304 ASSERT(sti->sti_laddr_len <= sti->sti_laddr_maxlen);
1305 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
1306 sti->sti_laddr_valid = 1;
1307 break;
1308 default:
1309 /*
1310 * Don't mark sti_laddr_valid, as we cannot be
1311 * sure that the returned address is the real
1312 * bound address when talking to an unknown
1313 * transport.
1314 */
1315 break;
1316 }
1317 }
1318
1319 freemsg(mp);
1320
1321 done:
1322 if (error) {
1323 /* reset state & backlog to values held on entry */
1324 if (clear_acceptconn_on_err == B_TRUE)
1325 so->so_state &= ~SS_ACCEPTCONN;
1326 if (restore_backlog_on_err == B_TRUE)
1327 so->so_backlog = save_so_backlog;
1328
1329 if (unbind_on_err && so->so_state & SS_ISBOUND) {
1330 int err;
1331
1332 err = sotpi_unbind(so, 0);
1333 /* LINTED - statement has no consequent: if */
1334 if (err) {
1335 eprintsoline(so, error);
1336 } else {
1337 ASSERT(!(so->so_state & SS_ISBOUND));
1338 }
1339 }
1340 }
1341 if (!(flags & _SOBIND_LOCK_HELD)) {
1342 so_unlock_single(so, SOLOCKED);
1343 mutex_exit(&so->so_lock);
1344 } else {
1345 ASSERT(MUTEX_HELD(&so->so_lock));
1346 ASSERT(so->so_flag & SOLOCKED);
1347 }
1348 return (error);
1349 }
1350
1351 /* bind the socket */
1352 static int
sotpi_bind(struct sonode * so,struct sockaddr * name,socklen_t namelen,int flags,struct cred * cr)1353 sotpi_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
1354 int flags, struct cred *cr)
1355 {
1356 if ((flags & _SOBIND_SOCKETPAIR) == 0)
1357 return (sotpi_bindlisten(so, name, namelen, 0, flags, cr));
1358
1359 flags &= ~_SOBIND_SOCKETPAIR;
1360 return (sotpi_bindlisten(so, name, namelen, 1, flags, cr));
1361 }
1362
1363 /*
1364 * Unbind a socket - used when bind() fails, when bind() specifies a NULL
1365 * address, or when listen needs to unbind and bind.
1366 * If the _SOUNBIND_REBIND flag is specified the addresses are retained
1367 * so that a sobind can pick them up.
1368 */
1369 static int
sotpi_unbind(struct sonode * so,int flags)1370 sotpi_unbind(struct sonode *so, int flags)
1371 {
1372 struct T_unbind_req unbind_req;
1373 int error = 0;
1374 mblk_t *mp;
1375 sotpi_info_t *sti = SOTOTPI(so);
1376
1377 dprintso(so, 1, ("sotpi_unbind(%p, 0x%x) %s\n",
1378 (void *)so, flags, pr_state(so->so_state, so->so_mode)));
1379
1380 ASSERT(MUTEX_HELD(&so->so_lock));
1381 ASSERT(so->so_flag & SOLOCKED);
1382
1383 if (!(so->so_state & SS_ISBOUND)) {
1384 error = EINVAL;
1385 eprintsoline(so, error);
1386 goto done;
1387 }
1388
1389 mutex_exit(&so->so_lock);
1390
1391 /*
1392 * Flush the read and write side (except stream head read queue)
1393 * and send down T_UNBIND_REQ.
1394 */
1395 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHRW);
1396
1397 unbind_req.PRIM_type = T_UNBIND_REQ;
1398 mp = soallocproto1(&unbind_req, sizeof (unbind_req),
1399 0, _ALLOC_SLEEP, CRED());
1400 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1401 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1402 mutex_enter(&so->so_lock);
1403 if (error) {
1404 eprintsoline(so, error);
1405 goto done;
1406 }
1407
1408 error = sowaitokack(so, T_UNBIND_REQ);
1409 if (error) {
1410 eprintsoline(so, error);
1411 goto done;
1412 }
1413
1414 /*
1415 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1416 * strsock_proto while the lock was dropped above, the unbind
1417 * is allowed to complete.
1418 */
1419 if (!(flags & _SOUNBIND_REBIND)) {
1420 /*
1421 * Clear out bound address.
1422 */
1423 vnode_t *vp;
1424
1425 if ((vp = sti->sti_ux_bound_vp) != NULL) {
1426 sti->sti_ux_bound_vp = NULL;
1427 vn_rele_stream(vp);
1428 }
1429 /* Clear out address */
1430 sti->sti_laddr_len = 0;
1431 }
1432 so->so_state &= ~(SS_ISBOUND|SS_ACCEPTCONN);
1433 sti->sti_laddr_valid = 0;
1434
1435 done:
1436
1437 /* If the caller held the lock don't release it here */
1438 ASSERT(MUTEX_HELD(&so->so_lock));
1439 ASSERT(so->so_flag & SOLOCKED);
1440
1441 return (error);
1442 }
1443
1444 /*
1445 * listen on the socket.
1446 * For TPI conforming transports this has to first unbind with the transport
1447 * and then bind again using the new backlog.
1448 */
1449 /* ARGSUSED */
1450 int
sotpi_listen(struct sonode * so,int backlog,struct cred * cr)1451 sotpi_listen(struct sonode *so, int backlog, struct cred *cr)
1452 {
1453 int error = 0;
1454 sotpi_info_t *sti = SOTOTPI(so);
1455
1456 dprintso(so, 1, ("sotpi_listen(%p, %d) %s\n",
1457 (void *)so, backlog, pr_state(so->so_state, so->so_mode)));
1458
1459 if (sti->sti_serv_type == T_CLTS)
1460 return (EOPNOTSUPP);
1461
1462 /*
1463 * If the socket is ready to accept connections already, then
1464 * return without doing anything. This avoids a problem where
1465 * a second listen() call fails if a connection is pending and
1466 * leaves the socket unbound. Only when we are not unbinding
1467 * with the transport can we safely increase the backlog.
1468 */
1469 if (so->so_state & SS_ACCEPTCONN &&
1470 !((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1471 /*CONSTCOND*/
1472 !solisten_tpi_tcp))
1473 return (0);
1474
1475 if (so->so_state & SS_ISCONNECTED)
1476 return (EINVAL);
1477
1478 mutex_enter(&so->so_lock);
1479 so_lock_single(so); /* Set SOLOCKED */
1480
1481 /*
1482 * If the listen doesn't change the backlog we do nothing.
1483 * This avoids an EPROTO error from the transport.
1484 */
1485 if ((so->so_state & SS_ACCEPTCONN) &&
1486 so->so_backlog == backlog)
1487 goto done;
1488
1489 if (!(so->so_state & SS_ISBOUND)) {
1490 /*
1491 * Must have been explicitly bound in the UNIX domain.
1492 */
1493 if (so->so_family == AF_UNIX) {
1494 error = EINVAL;
1495 goto done;
1496 }
1497 error = sotpi_bindlisten(so, NULL, 0, backlog,
1498 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1499 } else if (backlog > 0) {
1500 /*
1501 * AF_INET{,6} hack to avoid losing the port.
1502 * Assumes that all AF_INET{,6} transports can handle a
1503 * O_T_BIND_REQ with a non-zero CONIND_number when the TPI
1504 * has already bound thus it is possible to avoid the unbind.
1505 */
1506 if (!((so->so_family == AF_INET || so->so_family == AF_INET6) &&
1507 /*CONSTCOND*/
1508 !solisten_tpi_tcp)) {
1509 error = sotpi_unbind(so, _SOUNBIND_REBIND);
1510 if (error)
1511 goto done;
1512 }
1513 error = sotpi_bindlisten(so, NULL, 0, backlog,
1514 _SOBIND_REBIND|_SOBIND_LOCK_HELD|_SOBIND_LISTEN, cr);
1515 } else {
1516 so->so_state |= SS_ACCEPTCONN;
1517 so->so_backlog = backlog;
1518 }
1519 if (error)
1520 goto done;
1521 ASSERT(so->so_state & SS_ACCEPTCONN);
1522 done:
1523 so_unlock_single(so, SOLOCKED);
1524 mutex_exit(&so->so_lock);
1525 return (error);
1526 }
1527
1528 /*
1529 * Disconnect either a specified seqno or all (-1).
1530 * The former is used on listening sockets only.
1531 *
1532 * When seqno == -1 sodisconnect could call sotpi_unbind. However,
1533 * the current use of sodisconnect(seqno == -1) is only for shutdown
1534 * so there is no point (and potentially incorrect) to unbind.
1535 */
1536 static int
sodisconnect(struct sonode * so,t_scalar_t seqno,int flags)1537 sodisconnect(struct sonode *so, t_scalar_t seqno, int flags)
1538 {
1539 struct T_discon_req discon_req;
1540 int error = 0;
1541 mblk_t *mp;
1542
1543 dprintso(so, 1, ("sodisconnect(%p, %d, 0x%x) %s\n",
1544 (void *)so, seqno, flags, pr_state(so->so_state, so->so_mode)));
1545
1546 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1547 mutex_enter(&so->so_lock);
1548 so_lock_single(so); /* Set SOLOCKED */
1549 } else {
1550 ASSERT(MUTEX_HELD(&so->so_lock));
1551 ASSERT(so->so_flag & SOLOCKED);
1552 }
1553
1554 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ACCEPTCONN))) {
1555 error = EINVAL;
1556 eprintsoline(so, error);
1557 goto done;
1558 }
1559
1560 mutex_exit(&so->so_lock);
1561 /*
1562 * Flush the write side (unless this is a listener)
1563 * and then send down a T_DISCON_REQ.
1564 * (Don't flush on listener since it could flush {O_}T_CONN_RES
1565 * and other messages.)
1566 */
1567 if (!(so->so_state & SS_ACCEPTCONN))
1568 (void) putnextctl1(strvp2wq(SOTOV(so)), M_FLUSH, FLUSHW);
1569
1570 discon_req.PRIM_type = T_DISCON_REQ;
1571 discon_req.SEQ_number = seqno;
1572 mp = soallocproto1(&discon_req, sizeof (discon_req),
1573 0, _ALLOC_SLEEP, CRED());
1574 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
1575 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1576 mutex_enter(&so->so_lock);
1577 if (error) {
1578 eprintsoline(so, error);
1579 goto done;
1580 }
1581
1582 error = sowaitokack(so, T_DISCON_REQ);
1583 if (error) {
1584 eprintsoline(so, error);
1585 goto done;
1586 }
1587 /*
1588 * Even if some TPI message (e.g. T_DISCON_IND) was received in
1589 * strsock_proto while the lock was dropped above, the disconnect
1590 * is allowed to complete. However, it is not possible to
1591 * assert that SS_ISCONNECTED|SS_ISCONNECTING are set.
1592 */
1593 so->so_state &= ~(SS_ISCONNECTED|SS_ISCONNECTING);
1594 SOTOTPI(so)->sti_laddr_valid = 0;
1595 SOTOTPI(so)->sti_faddr_valid = 0;
1596 done:
1597 if (!(flags & _SODISCONNECT_LOCK_HELD)) {
1598 so_unlock_single(so, SOLOCKED);
1599 mutex_exit(&so->so_lock);
1600 } else {
1601 /* If the caller held the lock don't release it here */
1602 ASSERT(MUTEX_HELD(&so->so_lock));
1603 ASSERT(so->so_flag & SOLOCKED);
1604 }
1605 return (error);
1606 }
1607
1608 /* ARGSUSED */
1609 int
sotpi_accept(struct sonode * so,int fflag,struct cred * cr,struct sonode ** nsop)1610 sotpi_accept(struct sonode *so, int fflag, struct cred *cr,
1611 struct sonode **nsop)
1612 {
1613 struct T_conn_ind *conn_ind;
1614 struct T_conn_res *conn_res;
1615 int error = 0;
1616 mblk_t *mp, *ack_mp;
1617 struct sonode *nso;
1618 vnode_t *nvp;
1619 void *src;
1620 t_uscalar_t srclen;
1621 void *opt;
1622 t_uscalar_t optlen;
1623 t_scalar_t PRIM_type;
1624 t_scalar_t SEQ_number;
1625 size_t sinlen;
1626 sotpi_info_t *sti = SOTOTPI(so);
1627 sotpi_info_t *nsti;
1628
1629 dprintso(so, 1, ("sotpi_accept(%p, 0x%x, %p) %s\n",
1630 (void *)so, fflag, (void *)nsop,
1631 pr_state(so->so_state, so->so_mode)));
1632
1633 /*
1634 * Defer single-threading the accepting socket until
1635 * the T_CONN_IND has been received and parsed and the
1636 * new sonode has been opened.
1637 */
1638
1639 /* Check that we are not already connected */
1640 if ((so->so_state & SS_ACCEPTCONN) == 0)
1641 goto conn_bad;
1642
1643 if ((error = sowaitconnind(so, fflag, &mp)) != 0)
1644 goto e_bad;
1645
1646 ASSERT(mp != NULL);
1647 conn_ind = (struct T_conn_ind *)mp->b_rptr;
1648
1649 /*
1650 * Save SEQ_number for error paths.
1651 */
1652 SEQ_number = conn_ind->SEQ_number;
1653
1654 srclen = conn_ind->SRC_length;
1655 src = sogetoff(mp, conn_ind->SRC_offset, srclen, 1);
1656 if (src == NULL) {
1657 error = EPROTO;
1658 freemsg(mp);
1659 eprintsoline(so, error);
1660 goto disconnect_unlocked;
1661 }
1662 optlen = conn_ind->OPT_length;
1663 switch (so->so_family) {
1664 case AF_INET:
1665 case AF_INET6:
1666 if ((optlen == sizeof (intptr_t)) && (sti->sti_direct != 0)) {
1667 bcopy(mp->b_rptr + conn_ind->OPT_offset,
1668 &opt, conn_ind->OPT_length);
1669 } else {
1670 /*
1671 * The transport (in this case TCP) hasn't sent up
1672 * a pointer to an instance for the accept fast-path.
1673 * Disable fast-path completely because the call to
1674 * sotpi_create() below would otherwise create an
1675 * incomplete TCP instance, which would lead to
1676 * problems when sockfs sends a normal T_CONN_RES
1677 * message down the new stream.
1678 */
1679 if (sti->sti_direct) {
1680 int rval;
1681 /*
1682 * For consistency we inform tcp to disable
1683 * direct interface on the listener, though
1684 * we can certainly live without doing this
1685 * because no data will ever travel upstream
1686 * on the listening socket.
1687 */
1688 sti->sti_direct = 0;
1689 (void) strioctl(SOTOV(so), _SIOCSOCKFALLBACK,
1690 0, 0, K_TO_K, cr, &rval);
1691 }
1692 opt = NULL;
1693 optlen = 0;
1694 }
1695 break;
1696 case AF_UNIX:
1697 default:
1698 if (optlen != 0) {
1699 opt = sogetoff(mp, conn_ind->OPT_offset, optlen,
1700 __TPI_ALIGN_SIZE);
1701 if (opt == NULL) {
1702 error = EPROTO;
1703 freemsg(mp);
1704 eprintsoline(so, error);
1705 goto disconnect_unlocked;
1706 }
1707 }
1708 if (so->so_family == AF_UNIX) {
1709 if (!sti->sti_faddr_noxlate) {
1710 src = NULL;
1711 srclen = 0;
1712 }
1713 /* Extract src address from options */
1714 if (optlen != 0)
1715 so_getopt_srcaddr(opt, optlen, &src, &srclen);
1716 }
1717 break;
1718 }
1719
1720 /*
1721 * Create the new socket.
1722 */
1723 nso = socket_newconn(so, NULL, NULL, SOCKET_SLEEP, &error);
1724 if (nso == NULL) {
1725 ASSERT(error != 0);
1726 /*
1727 * Accept can not fail with ENOBUFS. sotpi_create
1728 * sleeps waiting for memory until a signal is caught
1729 * so return EINTR.
1730 */
1731 freemsg(mp);
1732 if (error == ENOBUFS)
1733 error = EINTR;
1734 goto e_disc_unl;
1735 }
1736 nvp = SOTOV(nso);
1737 nsti = SOTOTPI(nso);
1738
1739 #ifdef DEBUG
1740 /*
1741 * SO_DEBUG is used to trigger the dprint* and eprint* macros thus
1742 * it's inherited early to allow debugging of the accept code itself.
1743 */
1744 nso->so_options |= so->so_options & SO_DEBUG;
1745 #endif /* DEBUG */
1746
1747 /*
1748 * Save the SRC address from the T_CONN_IND
1749 * for getpeername to work on AF_UNIX and on transports that do not
1750 * support TI_GETPEERNAME.
1751 *
1752 * NOTE: AF_UNIX NUL termination is ensured by the sender's
1753 * copyin_name().
1754 */
1755 if (srclen > (t_uscalar_t)nsti->sti_faddr_maxlen) {
1756 error = EINVAL;
1757 freemsg(mp);
1758 eprintsoline(so, error);
1759 goto disconnect_vp_unlocked;
1760 }
1761 nsti->sti_faddr_len = (socklen_t)srclen;
1762 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
1763 bcopy(src, nsti->sti_faddr_sa, srclen);
1764 nsti->sti_faddr_valid = 1;
1765
1766 /*
1767 * Record so_peercred and so_cpid from a cred in the T_CONN_IND.
1768 */
1769 if ((DB_REF(mp) > 1) || MBLKSIZE(mp) <
1770 (sizeof (struct T_conn_res) + sizeof (intptr_t))) {
1771 cred_t *cr;
1772 pid_t cpid;
1773
1774 cr = msg_getcred(mp, &cpid);
1775 if (cr != NULL) {
1776 crhold(cr);
1777 nso->so_peercred = cr;
1778 nso->so_cpid = cpid;
1779 }
1780 freemsg(mp);
1781
1782 mp = soallocproto1(NULL, sizeof (struct T_conn_res) +
1783 sizeof (intptr_t), 0, _ALLOC_INTR, cr);
1784 if (mp == NULL) {
1785 /*
1786 * Accept can not fail with ENOBUFS.
1787 * A signal was caught so return EINTR.
1788 */
1789 error = EINTR;
1790 eprintsoline(so, error);
1791 goto disconnect_vp_unlocked;
1792 }
1793 conn_res = (struct T_conn_res *)mp->b_rptr;
1794 } else {
1795 /*
1796 * For efficency reasons we use msg_extractcred; no crhold
1797 * needed since db_credp is cleared (i.e., we move the cred
1798 * from the message to so_peercred.
1799 */
1800 nso->so_peercred = msg_extractcred(mp, &nso->so_cpid);
1801
1802 mp->b_rptr = DB_BASE(mp);
1803 conn_res = (struct T_conn_res *)mp->b_rptr;
1804 mp->b_wptr = mp->b_rptr + sizeof (struct T_conn_res);
1805
1806 mblk_setcred(mp, cr, curproc->p_pid);
1807 }
1808
1809 /*
1810 * New socket must be bound at least in sockfs and, except for AF_INET,
1811 * (or AF_INET6) it also has to be bound in the transport provider.
1812 * We set the local address in the sonode from the T_OK_ACK of the
1813 * T_CONN_RES. For this reason the address we bind to here isn't
1814 * important.
1815 */
1816 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
1817 /*CONSTCOND*/
1818 nso->so_type == SOCK_STREAM && !soaccept_tpi_tcp) {
1819 /*
1820 * Optimization for AF_INET{,6} transports
1821 * that can handle a T_CONN_RES without being bound.
1822 */
1823 mutex_enter(&nso->so_lock);
1824 so_automatic_bind(nso);
1825 mutex_exit(&nso->so_lock);
1826 } else {
1827 /* Perform NULL bind with the transport provider. */
1828 if ((error = sotpi_bind(nso, NULL, 0, _SOBIND_UNSPEC,
1829 cr)) != 0) {
1830 ASSERT(error != ENOBUFS);
1831 freemsg(mp);
1832 eprintsoline(nso, error);
1833 goto disconnect_vp_unlocked;
1834 }
1835 }
1836
1837 /*
1838 * Inherit SIOCSPGRP, SS_ASYNC before we send the {O_}T_CONN_RES
1839 * so that any data arriving on the new socket will cause the
1840 * appropriate signals to be delivered for the new socket.
1841 *
1842 * No other thread (except strsock_proto and strsock_misc)
1843 * can access the new socket thus we relax the locking.
1844 */
1845 nso->so_pgrp = so->so_pgrp;
1846 nso->so_state |= so->so_state & SS_ASYNC;
1847 nsti->sti_faddr_noxlate = sti->sti_faddr_noxlate;
1848
1849 if (nso->so_pgrp != 0) {
1850 if ((error = so_set_events(nso, nvp, cr)) != 0) {
1851 eprintsoline(nso, error);
1852 error = 0;
1853 nso->so_pgrp = 0;
1854 }
1855 }
1856
1857 /*
1858 * Make note of the socket level options. TCP and IP level options
1859 * are already inherited. We could do all this after accept is
1860 * successful but doing it here simplifies code and no harm done
1861 * for error case.
1862 */
1863 nso->so_options = so->so_options & (SO_DEBUG|SO_REUSEADDR|SO_KEEPALIVE|
1864 SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
1865 SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
1866 nso->so_sndbuf = so->so_sndbuf;
1867 nso->so_rcvbuf = so->so_rcvbuf;
1868 if (nso->so_options & SO_LINGER)
1869 nso->so_linger = so->so_linger;
1870
1871 /*
1872 * Note that the following sti_direct code path should be
1873 * removed once we are confident that the direct sockets
1874 * do not result in any degradation.
1875 */
1876 if (sti->sti_direct) {
1877
1878 ASSERT(opt != NULL);
1879
1880 conn_res->OPT_length = optlen;
1881 conn_res->OPT_offset = MBLKL(mp);
1882 bcopy(&opt, mp->b_wptr, optlen);
1883 mp->b_wptr += optlen;
1884 conn_res->PRIM_type = T_CONN_RES;
1885 conn_res->ACCEPTOR_id = 0;
1886 PRIM_type = T_CONN_RES;
1887
1888 /* Send down the T_CONN_RES on acceptor STREAM */
1889 error = kstrputmsg(SOTOV(nso), mp, NULL,
1890 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
1891 if (error) {
1892 mutex_enter(&so->so_lock);
1893 so_lock_single(so);
1894 eprintsoline(so, error);
1895 goto disconnect_vp;
1896 }
1897 mutex_enter(&nso->so_lock);
1898 error = sowaitprim(nso, T_CONN_RES, T_OK_ACK,
1899 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
1900 if (error) {
1901 mutex_exit(&nso->so_lock);
1902 mutex_enter(&so->so_lock);
1903 so_lock_single(so);
1904 eprintsoline(so, error);
1905 goto disconnect_vp;
1906 }
1907 if (nso->so_family == AF_INET) {
1908 sin_t *sin;
1909
1910 sin = (sin_t *)(ack_mp->b_rptr +
1911 sizeof (struct T_ok_ack));
1912 bcopy(sin, nsti->sti_laddr_sa, sizeof (sin_t));
1913 nsti->sti_laddr_len = sizeof (sin_t);
1914 } else {
1915 sin6_t *sin6;
1916
1917 sin6 = (sin6_t *)(ack_mp->b_rptr +
1918 sizeof (struct T_ok_ack));
1919 bcopy(sin6, nsti->sti_laddr_sa, sizeof (sin6_t));
1920 nsti->sti_laddr_len = sizeof (sin6_t);
1921 }
1922 freemsg(ack_mp);
1923
1924 nso->so_state |= SS_ISCONNECTED;
1925 nso->so_proto_handle = (sock_lower_handle_t)opt;
1926 nsti->sti_laddr_valid = 1;
1927
1928 mutex_exit(&nso->so_lock);
1929
1930 /*
1931 * It's possible, through the use of autopush for example,
1932 * that the acceptor stream may not support sti_direct
1933 * semantics. If the new socket does not support sti_direct
1934 * we issue a _SIOCSOCKFALLBACK to inform the transport
1935 * as we would in the I_PUSH case.
1936 */
1937 if (nsti->sti_direct == 0) {
1938 int rval;
1939
1940 if ((error = strioctl(SOTOV(nso), _SIOCSOCKFALLBACK,
1941 0, 0, K_TO_K, cr, &rval)) != 0) {
1942 mutex_enter(&so->so_lock);
1943 so_lock_single(so);
1944 eprintsoline(so, error);
1945 goto disconnect_vp;
1946 }
1947 }
1948
1949 /*
1950 * Pass out new socket.
1951 */
1952 if (nsop != NULL)
1953 *nsop = nso;
1954
1955 return (0);
1956 }
1957
1958 /*
1959 * This is the non-performance case for sockets (e.g. AF_UNIX sockets)
1960 * which don't support the FireEngine accept fast-path. It is also
1961 * used when the virtual "sockmod" has been I_POP'd and I_PUSH'd
1962 * again. Neither sockfs nor TCP attempt to find out if some other
1963 * random module has been inserted in between (in which case we
1964 * should follow TLI accept behaviour). We blindly assume the worst
1965 * case and revert back to old behaviour i.e. TCP will not send us
1966 * any option (eager) and the accept should happen on the listener
1967 * queue. Any queued T_conn_ind have already got their options removed
1968 * by so_sock2_stream() when "sockmod" was I_POP'd.
1969 */
1970 /*
1971 * Fill in the {O_}T_CONN_RES before getting SOLOCKED.
1972 */
1973 if ((nso->so_mode & SM_ACCEPTOR_ID) == 0) {
1974 #ifdef _ILP32
1975 queue_t *q;
1976
1977 /*
1978 * Find read queue in driver
1979 * Can safely do this since we "own" nso/nvp.
1980 */
1981 q = strvp2wq(nvp)->q_next;
1982 while (SAMESTR(q))
1983 q = q->q_next;
1984 q = RD(q);
1985 conn_res->ACCEPTOR_id = (t_uscalar_t)q;
1986 #else
1987 conn_res->ACCEPTOR_id = (t_uscalar_t)getminor(nvp->v_rdev);
1988 #endif /* _ILP32 */
1989 conn_res->PRIM_type = O_T_CONN_RES;
1990 PRIM_type = O_T_CONN_RES;
1991 } else {
1992 conn_res->ACCEPTOR_id = nsti->sti_acceptor_id;
1993 conn_res->PRIM_type = T_CONN_RES;
1994 PRIM_type = T_CONN_RES;
1995 }
1996 conn_res->SEQ_number = SEQ_number;
1997 conn_res->OPT_length = 0;
1998 conn_res->OPT_offset = 0;
1999
2000 mutex_enter(&so->so_lock);
2001 so_lock_single(so); /* Set SOLOCKED */
2002 mutex_exit(&so->so_lock);
2003
2004 error = kstrputmsg(SOTOV(so), mp, NULL,
2005 0, 0, MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
2006 mutex_enter(&so->so_lock);
2007 if (error) {
2008 eprintsoline(so, error);
2009 goto disconnect_vp;
2010 }
2011 error = sowaitprim(so, PRIM_type, T_OK_ACK,
2012 (t_uscalar_t)sizeof (struct T_ok_ack), &ack_mp, 0);
2013 if (error) {
2014 eprintsoline(so, error);
2015 goto disconnect_vp;
2016 }
2017 mutex_exit(&so->so_lock);
2018 /*
2019 * If there is a sin/sin6 appended onto the T_OK_ACK use
2020 * that to set the local address. If this is not present
2021 * then we zero out the address and don't set the
2022 * sti_laddr_valid bit. For AF_UNIX endpoints we copy over
2023 * the pathname from the listening socket.
2024 * In the case where this is TCP or an AF_UNIX socket the
2025 * client side may have queued data or a T_ORDREL in the
2026 * transport. Having now sent the T_CONN_RES we may receive
2027 * those queued messages at any time. Hold the acceptor
2028 * so_lock until its state and laddr are finalized.
2029 */
2030 mutex_enter(&nso->so_lock);
2031 sinlen = (nso->so_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t);
2032 if ((nso->so_family == AF_INET || nso->so_family == AF_INET6) &&
2033 MBLKL(ack_mp) == (sizeof (struct T_ok_ack) + sinlen)) {
2034 ack_mp->b_rptr += sizeof (struct T_ok_ack);
2035 bcopy(ack_mp->b_rptr, nsti->sti_laddr_sa, sinlen);
2036 nsti->sti_laddr_len = sinlen;
2037 nsti->sti_laddr_valid = 1;
2038 } else if (nso->so_family == AF_UNIX) {
2039 ASSERT(so->so_family == AF_UNIX);
2040 nsti->sti_laddr_len = sti->sti_laddr_len;
2041 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2042 bcopy(sti->sti_laddr_sa, nsti->sti_laddr_sa,
2043 nsti->sti_laddr_len);
2044 nsti->sti_laddr_valid = 1;
2045 } else {
2046 nsti->sti_laddr_len = sti->sti_laddr_len;
2047 ASSERT(nsti->sti_laddr_len <= nsti->sti_laddr_maxlen);
2048 bzero(nsti->sti_laddr_sa, nsti->sti_addr_size);
2049 nsti->sti_laddr_sa->sa_family = nso->so_family;
2050 }
2051 nso->so_state |= SS_ISCONNECTED;
2052 mutex_exit(&nso->so_lock);
2053
2054 freemsg(ack_mp);
2055
2056 mutex_enter(&so->so_lock);
2057 so_unlock_single(so, SOLOCKED);
2058 mutex_exit(&so->so_lock);
2059
2060 /*
2061 * Pass out new socket.
2062 */
2063 if (nsop != NULL)
2064 *nsop = nso;
2065
2066 return (0);
2067
2068 e_disc_unl:
2069 eprintsoline(so, error);
2070 goto disconnect_unlocked;
2071
2072 disconnect_vp_unlocked:
2073 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2074 VN_RELE(nvp);
2075 disconnect_unlocked:
2076 (void) sodisconnect(so, SEQ_number, 0);
2077 return (error);
2078
2079 disconnect_vp:
2080 (void) sodisconnect(so, SEQ_number, _SODISCONNECT_LOCK_HELD);
2081 so_unlock_single(so, SOLOCKED);
2082 mutex_exit(&so->so_lock);
2083 (void) VOP_CLOSE(nvp, 0, 1, 0, cr, NULL);
2084 VN_RELE(nvp);
2085 return (error);
2086
2087 conn_bad: /* Note: SunOS 4/BSD unconditionally returns EINVAL here */
2088 error = (so->so_type == SOCK_DGRAM || so->so_type == SOCK_RAW)
2089 ? EOPNOTSUPP : EINVAL;
2090 e_bad:
2091 eprintsoline(so, error);
2092 return (error);
2093 }
2094
2095 /*
2096 * connect a socket.
2097 *
2098 * Allow SOCK_DGRAM sockets to reconnect (by specifying a new address) and to
2099 * unconnect (by specifying a null address).
2100 */
/*
 * sotpi_connect() - summary of how the arguments are used below:
 *
 *   so       socket to connect; so_lock is taken here and released before
 *            returning.
 *   name     destination address.  NULL, or an address whose sa_family is
 *            AF_UNSPEC, is treated as a request to "unconnect" an already
 *            connected connection-less socket.
 *   namelen  length of name in bytes.
 *   fflag    handed to sowaitconnected(); forced to 0 on the datagram
 *            T_CONN_REQ path so that EWOULDBLOCK is not returned.
 *   flags    _SOCONNECT_NOXLATE and _SOCONNECT_XPG4_2 are examined;
 *            _SOCONNECT_DID_BIND is or'ed in locally when this call had to
 *            bind the socket itself.
 *   cr       credentials passed to soallocproto(), sotpi_bind() and
 *            sotpi_setsockopt().
 *
 * Returns 0 or an errno value.  Values assigned directly in this function
 * are EINTR, EOPNOTSUPP, EISCONN, EALREADY and EINVAL; anything else comes
 * from a callee or from so_error via sogeterr().
 *
 * All exits after so_lock has been taken go through the "done" label, which
 * frees an unsent T_CONN_REQ mblk, undoes a bind made by this call on fatal
 * errors, and drops SOLOCKED (if still held) and so_lock.
 */
2101 int
sotpi_connect(struct sonode * so,struct sockaddr * name,socklen_t namelen,int fflag,int flags,struct cred * cr)2102 sotpi_connect(struct sonode *so,
2103 struct sockaddr *name,
2104 socklen_t namelen,
2105 int fflag,
2106 int flags,
2107 struct cred *cr)
2108 {
2109 struct T_conn_req conn_req;
2110 int error = 0;
2111 mblk_t *mp;
2112 void *src;
2113 socklen_t srclen;
2114 void *addr;
2115 socklen_t addrlen;
2116 boolean_t need_unlock;
2117 sotpi_info_t *sti = SOTOTPI(so);
2118
2119 dprintso(so, 1, ("sotpi_connect(%p, %p, %d, 0x%x, 0x%x) %s\n",
2120 (void *)so, (void *)name, namelen, fflag, flags,
2121 pr_state(so->so_state, so->so_mode)));
2122
2123 /*
2124 * Preallocate the T_CONN_REQ mblk before grabbing SOLOCKED to
2125 * avoid sleeping for memory with SOLOCKED held.
2126 * We know that the T_CONN_REQ can't be larger than 2 * sti_faddr_maxlen
2127 * + sizeof (struct T_opthdr).
2128 * (the AF_UNIX so_ux_addr_xlate() does not make the address
2129 * exceed sti_faddr_maxlen).
2130 */
2131 mp = soallocproto(sizeof (struct T_conn_req) +
2132 2 * sti->sti_faddr_maxlen + sizeof (struct T_opthdr), _ALLOC_INTR,
2133 cr);
2134 if (mp == NULL) {
2135 /*
2136 * Connect can not fail with ENOBUFS. A signal was
2137 * caught so return EINTR.
2138 */
2139 error = EINTR;
2140 eprintsoline(so, error);
2141 return (error);
2142 }
2143
2144 mutex_enter(&so->so_lock);
2145 /*
2146 * Make sure there is a preallocated T_unbind_req message
2147 * before any binding. This message is allocated when the
2148 * socket is created. Since another thread can consume
2149 * so_unbind_mp by the time we return from so_lock_single(),
2150 * we should check the availability of so_unbind_mp after
2151 * we return from so_lock_single().
2152 */
2153
2154 so_lock_single(so); /* Set SOLOCKED */
/* need_unlock records whether SOLOCKED is held when "done" is reached. */
2155 need_unlock = B_TRUE;
2156
2157 if (sti->sti_unbind_mp == NULL) {
2158 dprintso(so, 1, ("sotpi_connect: allocating unbind_req\n"));
2159 /* NOTE: holding so_lock while sleeping */
2160 sti->sti_unbind_mp =
2161 soallocproto(sizeof (struct T_unbind_req), _ALLOC_INTR, cr);
2162 if (sti->sti_unbind_mp == NULL) {
2163 error = EINTR;
2164 goto done;
2165 }
2166 }
2167
2168 /*
2169 * Can't have done a listen before connecting.
2170 */
2171 if (so->so_state & SS_ACCEPTCONN) {
2172 error = EOPNOTSUPP;
2173 goto done;
2174 }
2175
2176 /*
2177 * Must be bound with the transport
2178 */
2179 if (!(so->so_state & SS_ISBOUND)) {
2180 if ((so->so_family == AF_INET || so->so_family == AF_INET6) &&
2181 /*CONSTCOND*/
2182 so->so_type == SOCK_STREAM && !soconnect_tpi_tcp) {
2183 /*
2184 * Optimization for AF_INET{,6} transports
2185 * that can handle a T_CONN_REQ without being bound.
2186 */
2187 so_automatic_bind(so);
2188 } else {
2189 error = sotpi_bind(so, NULL, 0,
2190 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
2191 if (error)
2192 goto done;
2193 }
2194 ASSERT(so->so_state & SS_ISBOUND);
/* Remembered so that a fatal error at "done" can undo this bind. */
2195 flags |= _SOCONNECT_DID_BIND;
2196 }
2197
2198 /*
2199 * Handle a connect to a name parameter of type AF_UNSPEC like a
2200 * connect to a null address. This is the portable method to
2201 * unconnect a socket.
2202 */
/* NOTE(review): name is dereferenced here whenever namelen is large enough; */
/* this relies on callers never passing name == NULL with a non-zero namelen. */
2203 if ((namelen >= sizeof (sa_family_t)) &&
2204 (name->sa_family == AF_UNSPEC)) {
2205 name = NULL;
2206 namelen = 0;
2207 }
2208
2209 /*
2210 * Check that we are not already connected.
2211 * A connection-oriented socket cannot be reconnected.
2212 * A connected connection-less socket can be
2213 * - connected to a different address by a subsequent connect
2214 * - "unconnected" by a connect to the NULL address
2215 */
2216 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) {
/* The bind above is only done for an unbound socket; a connected or */
/* connecting socket is asserted never to have needed it. */
2217 ASSERT(!(flags & _SOCONNECT_DID_BIND));
2218 if (so->so_mode & SM_CONNREQUIRED) {
2219 /* Connection-oriented socket */
2220 error = so->so_state & SS_ISCONNECTED ?
2221 EISCONN : EALREADY;
2222 goto done;
2223 }
2224 /* Connection-less socket */
2225 if (name == NULL) {
2226 /*
2227 * Remove the connected state and clear SO_DGRAM_ERRIND
2228 * since it was set when the socket was connected.
2229 * If this is UDP also send down a T_DISCON_REQ.
2230 */
2231 int val;
2232
2233 if ((so->so_family == AF_INET ||
2234 so->so_family == AF_INET6) &&
2235 (so->so_type == SOCK_DGRAM ||
2236 so->so_type == SOCK_RAW) &&
2237 /*CONSTCOND*/
2238 !soconnect_tpi_udp) {
2239 /* XXX What about implicitly unbinding here? */
2240 error = sodisconnect(so, -1,
2241 _SODISCONNECT_LOCK_HELD);
2242 } else {
2243 so->so_state &=
2244 ~(SS_ISCONNECTED | SS_ISCONNECTING);
2245 sti->sti_faddr_valid = 0;
2246 sti->sti_faddr_len = 0;
2247 }
2248
2249 /* Remove SOLOCKED since setsockopt will grab it */
2250 so_unlock_single(so, SOLOCKED);
2251 mutex_exit(&so->so_lock);
2252
2253 val = 0;
2254 (void) sotpi_setsockopt(so, SOL_SOCKET,
2255 SO_DGRAM_ERRIND, &val, (t_uscalar_t)sizeof (val),
2256 cr);
2257
2258 mutex_enter(&so->so_lock);
2259 so_lock_single(so); /* Set SOLOCKED */
/* error is 0 here unless sodisconnect() failed above. */
2260 goto done;
2261 }
2262 }
2263 ASSERT(so->so_state & SS_ISBOUND);
2264
2265 if (name == NULL || namelen == 0) {
2266 error = EINVAL;
2267 goto done;
2268 }
2269 /*
2270 * Mark the socket if sti_faddr_sa represents the transport level
2271 * address.
2272 */
2273 if (flags & _SOCONNECT_NOXLATE) {
2274 struct sockaddr_ux *soaddr_ux;
2275
2276 ASSERT(so->so_family == AF_UNIX);
2277 if (namelen != sizeof (struct sockaddr_ux)) {
2278 error = EINVAL;
2279 goto done;
2280 }
/* From here on name/namelen describe only the embedded sou_addr member. */
2281 soaddr_ux = (struct sockaddr_ux *)name;
2282 name = (struct sockaddr *)&soaddr_ux->sou_addr;
2283 namelen = sizeof (soaddr_ux->sou_addr);
2284 sti->sti_faddr_noxlate = 1;
2285 }
2286
2287 /*
2288 * Length and family checks.
2289 */
2290 error = so_addr_verify(so, name, namelen);
/* "bad" logs the error with eprintsoline() and then joins "done". */
2291 if (error)
2292 goto bad;
2293
2294 /*
2295 * Save foreign address. Needed for AF_UNIX as well as
2296 * transport providers that do not support TI_GETPEERNAME.
2297 * Also used for cached foreign address for TCP and UDP.
2298 */
2299 if (namelen > (t_uscalar_t)sti->sti_faddr_maxlen) {
2300 error = EINVAL;
2301 goto done;
2302 }
2303 sti->sti_faddr_len = (socklen_t)namelen;
2304 ASSERT(sti->sti_faddr_len <= sti->sti_faddr_maxlen);
2305 bcopy(name, sti->sti_faddr_sa, namelen);
2306 sti->sti_faddr_valid = 1;
2307
/* Pick the destination (addr/addrlen) and optional source (src/srclen) */
/* that will be placed in the T_CONN_REQ built further down. */
2308 if (so->so_family == AF_UNIX) {
2309 if (sti->sti_faddr_noxlate) {
2310 /*
2311 * sti_faddr is a transport-level address, so
2312 * don't pass it as an option. Do save it in
2313 * sti_ux_faddr, used for connected DG send.
2314 */
2315 src = NULL;
2316 srclen = 0;
2317 addr = sti->sti_faddr_sa;
2318 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2319 bcopy(addr, &sti->sti_ux_faddr,
2320 sizeof (sti->sti_ux_faddr));
2321 } else {
2322 /*
2323 * Pass the sockaddr_un source address as an option
2324 * and translate the remote address.
2325 * Holding so_lock thus sti_laddr_sa can not change.
2326 */
2327 src = sti->sti_laddr_sa;
2328 srclen = (t_uscalar_t)sti->sti_laddr_len;
2329 dprintso(so, 1,
2330 ("sotpi_connect UNIX: srclen %d, src %p\n",
2331 srclen, src));
2332 /*
2333 * Translate the destination address into our
2334 * internal form, and save it in sti_ux_faddr.
2335 * After this call, addr==&sti->sti_ux_taddr,
2336 * and we copy that to sti->sti_ux_faddr so
2337 * we save the connected peer address.
2338 */
2339 error = so_ux_addr_xlate(so,
2340 sti->sti_faddr_sa, (socklen_t)sti->sti_faddr_len,
2341 (flags & _SOCONNECT_XPG4_2),
2342 &addr, &addrlen);
2343 if (error)
2344 goto bad;
2345 bcopy(&sti->sti_ux_taddr, &sti->sti_ux_faddr,
2346 sizeof (sti->sti_ux_faddr));
2347 }
2348 } else {
2349 addr = sti->sti_faddr_sa;
2350 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2351 src = NULL;
2352 srclen = 0;
2353 }
2354 /*
2355 * When connecting a datagram socket we issue the SO_DGRAM_ERRIND
2356 * option which asks the transport provider to send T_UDERR_IND
2357 * messages. These T_UDERR_IND messages are used to return connected
2358 * style errors (e.g. ECONNRESET) for connected datagram sockets.
2359 *
2360 * In addition, for UDP (and SOCK_RAW AF_INET{,6} sockets)
2361 * we send down a T_CONN_REQ. This is needed to let the
2362 * transport assign a local address that is consistent with
2363 * the remote address. Applications depend on a getsockname()
2364 * after a connect() to retrieve the "source" IP address for
2365 * the connected socket. Invalidate the cached local address
2366 * to force getsockname() to enquire of the transport.
2367 */
2368 if (!(so->so_mode & SM_CONNREQUIRED)) {
2369 /*
2370 * Datagram socket.
2371 */
2372 int32_t val;
2373
/* SOLOCKED and so_lock are released around sotpi_setsockopt(), as in the */
/* unconnect path above, and re-taken afterwards. */
2374 so_unlock_single(so, SOLOCKED);
2375 mutex_exit(&so->so_lock);
2376
2377 val = 1;
2378 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DGRAM_ERRIND,
2379 &val, (t_uscalar_t)sizeof (val), cr);
2380
2381 mutex_enter(&so->so_lock);
2382 so_lock_single(so); /* Set SOLOCKED */
/* Everything except AF_INET{,6} SOCK_DGRAM/SOCK_RAW (or when */
/* soconnect_tpi_udp is set) is marked connected without a T_CONN_REQ. */
2383 if ((so->so_family != AF_INET && so->so_family != AF_INET6) ||
2384 (so->so_type != SOCK_DGRAM && so->so_type != SOCK_RAW) ||
2385 soconnect_tpi_udp) {
2386 soisconnected(so);
2387 goto done;
2388 }
2389 /*
2390 * Send down T_CONN_REQ etc.
2391 * Clear fflag to avoid returning EWOULDBLOCK.
2392 */
2393 fflag = 0;
2394 ASSERT(so->so_family != AF_UNIX);
2395 sti->sti_laddr_valid = 0;
2396 } else if (sti->sti_laddr_len != 0) {
2397 /*
2398 * If the local address or port was "any" then it may be
2399 * changed by the transport as a result of the
2400 * connect. Invalidate the cached version if we have one.
2401 */
2402 switch (so->so_family) {
2403 case AF_INET:
2404 ASSERT(sti->sti_laddr_len == (socklen_t)sizeof (sin_t));
2405 if (((sin_t *)sti->sti_laddr_sa)->sin_addr.s_addr ==
2406 INADDR_ANY ||
2407 ((sin_t *)sti->sti_laddr_sa)->sin_port == 0)
2408 sti->sti_laddr_valid = 0;
2409 break;
2410
2411 case AF_INET6:
2412 ASSERT(sti->sti_laddr_len ==
2413 (socklen_t)sizeof (sin6_t));
2414 if (IN6_IS_ADDR_UNSPECIFIED(
2415 &((sin6_t *)sti->sti_laddr_sa) ->sin6_addr) ||
2416 IN6_IS_ADDR_V4MAPPED_ANY(
2417 &((sin6_t *)sti->sti_laddr_sa)->sin6_addr) ||
2418 ((sin6_t *)sti->sti_laddr_sa)->sin6_port == 0)
2419 sti->sti_laddr_valid = 0;
2420 break;
2421
2422 default:
2423 break;
2424 }
2425 }
2426
2427 /*
2428 * Check for failure of an earlier call
2429 */
2430 if (so->so_error != 0)
2431 goto so_bad;
2432
2433 /*
2434 * Send down T_CONN_REQ. Message was allocated above.
2435 */
2436 conn_req.PRIM_type = T_CONN_REQ;
2437 conn_req.DEST_length = addrlen;
2438 conn_req.DEST_offset = (t_scalar_t)sizeof (conn_req);
2439 if (srclen == 0) {
2440 conn_req.OPT_length = 0;
2441 conn_req.OPT_offset = 0;
2442 soappendmsg(mp, &conn_req, sizeof (conn_req));
2443 soappendmsg(mp, addr, addrlen);
2444 } else {
2445 /*
2446 * There is a AF_UNIX sockaddr_un to include as a source
2447 * address option.
2448 */
2449 struct T_opthdr toh;
2450
2451 toh.level = SOL_SOCKET;
2452 toh.name = SO_SRCADDR;
2453 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
2454 toh.status = 0;
2455 conn_req.OPT_length =
2456 (t_scalar_t)(sizeof (toh) + _TPI_ALIGN_TOPT(srclen));
2457 conn_req.OPT_offset = (t_scalar_t)(sizeof (conn_req) +
2458 _TPI_ALIGN_TOPT(addrlen));
2459
2460 soappendmsg(mp, &conn_req, sizeof (conn_req));
2461 soappendmsg(mp, addr, addrlen);
/* Step over the alignment padding that follows the address. */
2462 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2463 soappendmsg(mp, &toh, sizeof (toh));
2464 soappendmsg(mp, src, srclen);
/* Likewise pad the option value out to its aligned length. */
2465 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2466 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2467 }
2468 /*
2469 * Set SS_ISCONNECTING before sending down the T_CONN_REQ
2470 * in order to have the right state when the T_CONN_CON shows up.
2471 */
2472 soisconnecting(so);
/* Only so_lock is dropped across kstrputmsg(); SOLOCKED stays set. */
2473 mutex_exit(&so->so_lock);
2474
2475 if (AU_AUDITING())
2476 audit_sock(T_CONN_REQ, strvp2wq(SOTOV(so)), mp, 0);
2477
2478 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2479 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR, 0);
/* Presumably kstrputmsg() has taken ownership of mp (TODO confirm); */
/* clearing it keeps the freemsg() at "done" away from the sent message. */
2480 mp = NULL;
2481 mutex_enter(&so->so_lock);
2482 if (error != 0)
2483 goto bad;
2484
2485 if ((error = sowaitokack(so, T_CONN_REQ)) != 0)
2486 goto bad;
2487
2488 /* Allow other threads to access the socket */
2489 so_unlock_single(so, SOLOCKED);
2490 need_unlock = B_FALSE;
2491
2492 /*
2493 * Wait until we get a T_CONN_CON or an error
2494 */
2495 if ((error = sowaitconnected(so, fflag, 0)) != 0) {
/* Re-take SOLOCKED: the fatal-error case at "done" asserts need_unlock. */
2496 so_lock_single(so); /* Set SOLOCKED */
2497 need_unlock = B_TRUE;
2498 }
2499
/* Common exit: so_lock is held here; mp is NULL if it was already sent. */
2500 done:
2501 freemsg(mp);
2502 switch (error) {
2503 case EINPROGRESS:
2504 case EALREADY:
2505 case EISCONN:
2506 case EINTR:
2507 /* Non-fatal errors */
2508 sti->sti_laddr_valid = 0;
2509 /* FALLTHRU */
2510 case 0:
2511 break;
2512 default:
2513 ASSERT(need_unlock);
2514 /*
2515 * Fatal errors: clear SS_ISCONNECTING in case it was set,
2516 * and invalidate local-address cache
2517 */
2518 so->so_state &= ~SS_ISCONNECTING;
2519 sti->sti_laddr_valid = 0;
2520 /* A discon_ind might have already unbound us */
2521 if ((flags & _SOCONNECT_DID_BIND) &&
2522 (so->so_state & SS_ISBOUND)) {
2523 int err;
2524
2525 err = sotpi_unbind(so, 0);
2526 /* LINTED - statement has no conseq */
2527 if (err) {
2528 eprintsoline(so, err);
2529 }
2530 }
2531 break;
2532 }
2533 if (need_unlock)
2534 so_unlock_single(so, SOLOCKED);
2535 mutex_exit(&so->so_lock);
2536 return (error);
2537
/* so_bad: replace error with the value sogeterr() returns for the socket, */
/* then fall into bad: which logs the error and jumps back to "done". */
2538 so_bad: error = sogeterr(so, B_TRUE);
2539 bad: eprintsoline(so, error);
2540 goto done;
2541 }
2542
2543 /* ARGSUSED */
2544 int
sotpi_shutdown(struct sonode * so,int how,struct cred * cr)2545 sotpi_shutdown(struct sonode *so, int how, struct cred *cr)
2546 {
2547 struct T_ordrel_req ordrel_req;
2548 mblk_t *mp;
2549 uint_t old_state, state_change;
2550 int error = 0;
2551 sotpi_info_t *sti = SOTOTPI(so);
2552
2553 dprintso(so, 1, ("sotpi_shutdown(%p, %d) %s\n",
2554 (void *)so, how, pr_state(so->so_state, so->so_mode)));
2555
2556 mutex_enter(&so->so_lock);
2557 so_lock_single(so); /* Set SOLOCKED */
2558
2559 /*
2560 * SunOS 4.X has no check for datagram sockets.
2561 * 5.X checks that it is connected (ENOTCONN)
2562 * X/Open requires that we check the connected state.
2563 */
2564 if (!(so->so_state & SS_ISCONNECTED)) {
2565 if (!xnet_skip_checks) {
2566 error = ENOTCONN;
2567 if (xnet_check_print) {
2568 printf("sockfs: X/Open shutdown check "
2569 "caused ENOTCONN\n");
2570 }
2571 }
2572 goto done;
2573 }
2574 /*
2575 * Record the current state and then perform any state changes.
2576 * Then use the difference between the old and new states to
2577 * determine which messages need to be sent.
2578 * This prevents e.g. duplicate T_ORDREL_REQ when there are
2579 * duplicate calls to shutdown().
2580 */
2581 old_state = so->so_state;
2582
2583 switch (how) {
2584 case 0:
2585 socantrcvmore(so);
2586 break;
2587 case 1:
2588 socantsendmore(so);
2589 break;
2590 case 2:
2591 socantsendmore(so);
2592 socantrcvmore(so);
2593 break;
2594 default:
2595 error = EINVAL;
2596 goto done;
2597 }
2598
2599 /*
2600 * Assumes that the SS_CANT* flags are never cleared in the above code.
2601 */
2602 state_change = (so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) -
2603 (old_state & (SS_CANTRCVMORE|SS_CANTSENDMORE));
2604 ASSERT((state_change & ~(SS_CANTRCVMORE|SS_CANTSENDMORE)) == 0);
2605
2606 switch (state_change) {
2607 case 0:
2608 dprintso(so, 1,
2609 ("sotpi_shutdown: nothing to send in state 0x%x\n",
2610 so->so_state));
2611 goto done;
2612
2613 case SS_CANTRCVMORE:
2614 mutex_exit(&so->so_lock);
2615 strseteof(SOTOV(so), 1);
2616 /*
2617 * strseteof takes care of read side wakeups,
2618 * pollwakeups, and signals.
2619 */
2620 /*
2621 * Get the read lock before flushing data to avoid problems
2622 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2623 */
2624 mutex_enter(&so->so_lock);
2625 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2626 mutex_exit(&so->so_lock);
2627
2628 /* Flush read side queue */
2629 strflushrq(SOTOV(so), FLUSHALL);
2630
2631 mutex_enter(&so->so_lock);
2632 so_unlock_read(so); /* Clear SOREADLOCKED */
2633 break;
2634
2635 case SS_CANTSENDMORE:
2636 mutex_exit(&so->so_lock);
2637 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2638 mutex_enter(&so->so_lock);
2639 break;
2640
2641 case SS_CANTSENDMORE|SS_CANTRCVMORE:
2642 mutex_exit(&so->so_lock);
2643 strsetwerror(SOTOV(so), 0, 0, sogetwrerr);
2644 strseteof(SOTOV(so), 1);
2645 /*
2646 * strseteof takes care of read side wakeups,
2647 * pollwakeups, and signals.
2648 */
2649 /*
2650 * Get the read lock before flushing data to avoid problems
2651 * with the T_EXDATA_IND MSG_PEEK code in sotpi_recvmsg.
2652 */
2653 mutex_enter(&so->so_lock);
2654 (void) so_lock_read(so, 0); /* Set SOREADLOCKED */
2655 mutex_exit(&so->so_lock);
2656
2657 /* Flush read side queue */
2658 strflushrq(SOTOV(so), FLUSHALL);
2659
2660 mutex_enter(&so->so_lock);
2661 so_unlock_read(so); /* Clear SOREADLOCKED */
2662 break;
2663 }
2664
2665 ASSERT(MUTEX_HELD(&so->so_lock));
2666
2667 /*
2668 * If either SS_CANTSENDMORE or SS_CANTRCVMORE or both of them
2669 * was set due to this call and the new state has both of them set:
2670 * Send the AF_UNIX close indication
2671 * For T_COTS send a discon_ind
2672 *
2673 * If cantsend was set due to this call:
2674 * For T_COTSORD send an ordrel_ind
2675 *
2676 * Note that for T_CLTS there is no message sent here.
2677 */
2678 if ((so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
2679 (SS_CANTRCVMORE|SS_CANTSENDMORE)) {
2680 /*
2681 * For SunOS 4.X compatibility we tell the other end
2682 * that we are unable to receive at this point.
2683 */
2684 if (so->so_family == AF_UNIX && sti->sti_serv_type != T_CLTS)
2685 so_unix_close(so);
2686
2687 if (sti->sti_serv_type == T_COTS)
2688 error = sodisconnect(so, -1, _SODISCONNECT_LOCK_HELD);
2689 }
2690 if ((state_change & SS_CANTSENDMORE) &&
2691 (sti->sti_serv_type == T_COTS_ORD)) {
2692 /* Send an orderly release */
2693 ordrel_req.PRIM_type = T_ORDREL_REQ;
2694
2695 mutex_exit(&so->so_lock);
2696 mp = soallocproto1(&ordrel_req, sizeof (ordrel_req),
2697 0, _ALLOC_SLEEP, cr);
2698 /*
2699 * Send down the T_ORDREL_REQ even if there is flow control.
2700 * This prevents shutdown from blocking.
2701 * Note that there is no T_OK_ACK for ordrel_req.
2702 */
2703 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2704 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2705 mutex_enter(&so->so_lock);
2706 if (error) {
2707 eprintsoline(so, error);
2708 goto done;
2709 }
2710 }
2711
2712 done:
2713 so_unlock_single(so, SOLOCKED);
2714 mutex_exit(&so->so_lock);
2715 return (error);
2716 }
2717
2718 /*
2719 * For any connected SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket we send
2720 * a zero-length T_OPTDATA_REQ with the SO_UNIX_CLOSE option to inform the peer
2721 * that we have closed.
2722 * Also, for connected AF_UNIX SOCK_DGRAM sockets we send a zero-length
2723 * T_UNITDATA_REQ containing the same option.
2724 *
2725 * For SOCK_DGRAM half-connections (somebody connected to this end
2726 * but this end is not connect) we don't know where to send any
2727 * SO_UNIX_CLOSE.
2728 *
2729 * We have to ignore stream head errors just in case there has been
2730 * a shutdown(output).
2731 * Ignore any flow control to try to get the message more quickly to the peer.
2732 * While locally ignoring flow control solves the problem when there
2733 * is only the loopback transport on the stream it would not provide
2734 * the correct AF_UNIX socket semantics when one or more modules have
2735 * been pushed.
2736 */
2737 void
so_unix_close(struct sonode * so)2738 so_unix_close(struct sonode *so)
2739 {
2740 struct T_opthdr toh;
2741 mblk_t *mp;
2742 sotpi_info_t *sti = SOTOTPI(so);
2743
2744 ASSERT(MUTEX_HELD(&so->so_lock));
2745
2746 ASSERT(so->so_family == AF_UNIX);
2747
2748 if ((so->so_state & (SS_ISCONNECTED|SS_ISBOUND)) !=
2749 (SS_ISCONNECTED|SS_ISBOUND))
2750 return;
2751
2752 dprintso(so, 1, ("so_unix_close(%p) %s\n",
2753 (void *)so, pr_state(so->so_state, so->so_mode)));
2754
2755 toh.level = SOL_SOCKET;
2756 toh.name = SO_UNIX_CLOSE;
2757
2758 /* zero length + header */
2759 toh.len = (t_uscalar_t)sizeof (struct T_opthdr);
2760 toh.status = 0;
2761
2762 if (so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET) {
2763 struct T_optdata_req tdr;
2764
2765 tdr.PRIM_type = T_OPTDATA_REQ;
2766 tdr.DATA_flag = 0;
2767
2768 tdr.OPT_length = (t_scalar_t)sizeof (toh);
2769 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
2770
2771 /* NOTE: holding so_lock while sleeping */
2772 mp = soallocproto2(&tdr, sizeof (tdr),
2773 &toh, sizeof (toh), 0, _ALLOC_SLEEP, CRED());
2774 } else {
2775 struct T_unitdata_req tudr;
2776 void *addr;
2777 socklen_t addrlen;
2778 void *src;
2779 socklen_t srclen;
2780 struct T_opthdr toh2;
2781 t_scalar_t size;
2782
2783 /*
2784 * We know this is an AF_UNIX connected DGRAM socket.
2785 * We therefore already have the destination address
2786 * in the internal form needed for this send. This is
2787 * similar to the sosend_dgram call later in this file
2788 * when there's no user-specified destination address.
2789 */
2790 if (sti->sti_faddr_noxlate) {
2791 /*
2792 * Already have a transport internal address. Do not
2793 * pass any (transport internal) source address.
2794 */
2795 addr = sti->sti_faddr_sa;
2796 addrlen = (t_uscalar_t)sti->sti_faddr_len;
2797 src = NULL;
2798 srclen = 0;
2799 } else {
2800 /*
2801 * Pass the sockaddr_un source address as an option
2802 * and translate the remote address.
2803 * Holding so_lock thus sti_laddr_sa can not change.
2804 */
2805 src = sti->sti_laddr_sa;
2806 srclen = (socklen_t)sti->sti_laddr_len;
2807 dprintso(so, 1,
2808 ("so_ux_close: srclen %d, src %p\n",
2809 srclen, src));
2810 /*
2811 * Use the destination address saved in connect.
2812 */
2813 addr = &sti->sti_ux_faddr;
2814 addrlen = sizeof (sti->sti_ux_faddr);
2815 }
2816 tudr.PRIM_type = T_UNITDATA_REQ;
2817 tudr.DEST_length = addrlen;
2818 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
2819 if (srclen == 0) {
2820 tudr.OPT_length = (t_scalar_t)sizeof (toh);
2821 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2822 _TPI_ALIGN_TOPT(addrlen));
2823
2824 size = tudr.OPT_offset + tudr.OPT_length;
2825 /* NOTE: holding so_lock while sleeping */
2826 mp = soallocproto2(&tudr, sizeof (tudr),
2827 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2828 mp->b_wptr += (_TPI_ALIGN_TOPT(addrlen) - addrlen);
2829 soappendmsg(mp, &toh, sizeof (toh));
2830 } else {
2831 /*
2832 * There is a AF_UNIX sockaddr_un to include as a
2833 * source address option.
2834 */
2835 tudr.OPT_length = (t_scalar_t)(2 * sizeof (toh) +
2836 _TPI_ALIGN_TOPT(srclen));
2837 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
2838 _TPI_ALIGN_TOPT(addrlen));
2839
2840 toh2.level = SOL_SOCKET;
2841 toh2.name = SO_SRCADDR;
2842 toh2.len = (t_uscalar_t)(srclen +
2843 sizeof (struct T_opthdr));
2844 toh2.status = 0;
2845
2846 size = tudr.OPT_offset + tudr.OPT_length;
2847
2848 /* NOTE: holding so_lock while sleeping */
2849 mp = soallocproto2(&tudr, sizeof (tudr),
2850 addr, addrlen, size, _ALLOC_SLEEP, CRED());
2851 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
2852 soappendmsg(mp, &toh, sizeof (toh));
2853 soappendmsg(mp, &toh2, sizeof (toh2));
2854 soappendmsg(mp, src, srclen);
2855 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
2856 }
2857 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
2858 }
2859 mutex_exit(&so->so_lock);
2860 (void) kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
2861 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
2862 mutex_enter(&so->so_lock);
2863 }
2864
2865 /*
2866 * Called by sotpi_recvmsg when reading a non-zero amount of data.
2867 * In addition, the caller typically verifies that there is some
2868 * potential state to clear by checking
2869 * if (so->so_state & (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK))
2870 * before calling this routine.
2871 * Note that such a check can be made without holding so_lock since
2872 * sotpi_recvmsg is single-threaded (using SOREADLOCKED) and only sotpi_recvmsg
2873 * decrements sti_oobsigcnt.
2874 *
2875 * When data is read *after* the point that all pending
2876 * oob data has been consumed the oob indication is cleared.
2877 *
2878 * This logic keeps select/poll returning POLLRDBAND and
2879 * SIOCATMARK returning true until we have read past
2880 * the mark.
2881 */
2882 static void
sorecv_update_oobstate(struct sonode * so)2883 sorecv_update_oobstate(struct sonode *so)
2884 {
2885 sotpi_info_t *sti = SOTOTPI(so);
2886
2887 mutex_enter(&so->so_lock);
2888 ASSERT(so_verify_oobstate(so));
2889 dprintso(so, 1,
2890 ("sorecv_update_oobstate: counts %d/%d state %s\n",
2891 sti->sti_oobsigcnt,
2892 sti->sti_oobcnt, pr_state(so->so_state, so->so_mode)));
2893 if (sti->sti_oobsigcnt == 0) {
2894 /* No more pending oob indications */
2895 so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
2896 freemsg(so->so_oobmsg);
2897 so->so_oobmsg = NULL;
2898 }
2899 ASSERT(so_verify_oobstate(so));
2900 mutex_exit(&so->so_lock);
2901 }
2902
2903 /*
2904 * Receive the next message on the queue.
2905 * If msg_controllen is non-zero when called the caller is interested in
2906 * any received control info (options).
2907 * If msg_namelen is non-zero when called the caller is interested in
2908 * any received source address.
2909 * The routine returns with msg_control and msg_name pointing to
2910 * kmem_alloc'ed memory which the caller has to free.
2911 */
2912 /* ARGSUSED */
2913 int
sotpi_recvmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)2914 sotpi_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
2915 struct cred *cr)
2916 {
2917 union T_primitives *tpr;
2918 mblk_t *mp;
2919 uchar_t pri;
2920 int pflag, opflag;
2921 void *control;
2922 t_uscalar_t controllen;
2923 t_uscalar_t namelen;
2924 int so_state = so->so_state; /* Snapshot */
2925 ssize_t saved_resid;
2926 rval_t rval;
2927 int flags;
2928 clock_t timout;
2929 int error = 0;
2930 sotpi_info_t *sti = SOTOTPI(so);
2931
2932 flags = msg->msg_flags;
2933 msg->msg_flags = 0;
2934
2935 dprintso(so, 1, ("sotpi_recvmsg(%p, %p, 0x%x) state %s err %d\n",
2936 (void *)so, (void *)msg, flags,
2937 pr_state(so->so_state, so->so_mode), so->so_error));
2938
2939 if (so->so_version == SOV_STREAM) {
2940 so_update_attrs(so, SOACC);
2941 /* The imaginary "sockmod" has been popped - act as a stream */
2942 return (strread(SOTOV(so), uiop, cr));
2943 }
2944
2945 /*
2946 * If we are not connected because we have never been connected
2947 * we return ENOTCONN. If we have been connected (but are no longer
2948 * connected) then SS_CANTRCVMORE is set and we let kstrgetmsg return
2949 * the EOF.
2950 *
2951 * An alternative would be to post an ENOTCONN error in stream head
2952 * (read+write) and clear it when we're connected. However, that error
2953 * would cause incorrect poll/select behavior!
2954 */
2955 if ((so_state & (SS_ISCONNECTED|SS_CANTRCVMORE)) == 0 &&
2956 (so->so_mode & SM_CONNREQUIRED)) {
2957 return (ENOTCONN);
2958 }
2959
2960 /*
2961 * Note: SunOS 4.X checks uio_resid == 0 before going to sleep (but
2962 * after checking that the read queue is empty) and returns zero.
2963 * This implementation will sleep (in kstrgetmsg) even if uio_resid
2964 * is zero.
2965 */
2966
2967 if (flags & MSG_OOB) {
2968 /* Check that the transport supports OOB */
2969 if (!(so->so_mode & SM_EXDATA))
2970 return (EOPNOTSUPP);
2971 so_update_attrs(so, SOACC);
2972 return (sorecvoob(so, msg, uiop, flags,
2973 (so->so_options & SO_OOBINLINE)));
2974 }
2975
2976 so_update_attrs(so, SOACC);
2977
2978 /*
2979 * Set msg_controllen and msg_namelen to zero here to make it
2980 * simpler in the cases that no control or name is returned.
2981 */
2982 controllen = msg->msg_controllen;
2983 namelen = msg->msg_namelen;
2984 msg->msg_controllen = 0;
2985 msg->msg_namelen = 0;
2986
2987 dprintso(so, 1, ("sotpi_recvmsg: namelen %d controllen %d\n",
2988 namelen, controllen));
2989
2990 mutex_enter(&so->so_lock);
2991 /*
2992 * Only one reader is allowed at any given time. This is needed
2993 * for T_EXDATA handling and, in the future, MSG_WAITALL.
2994 *
2995 * This is slightly different than BSD behavior in that it fails with
2996 * EWOULDBLOCK when using nonblocking io. In BSD the read queue access
2997 * is single-threaded using sblock(), which is dropped while waiting
2998 * for data to appear. The difference shows up e.g. if one
2999 * file descriptor does not have O_NONBLOCK but a dup'ed file descriptor
3000 * does use nonblocking io and different threads are reading each
3001 * file descriptor. In BSD there would never be an EWOULDBLOCK error
3002 * in this case as long as the read queue doesn't get empty.
3003 * In this implementation the thread using nonblocking io can
3004 * get an EWOULDBLOCK error due to the blocking thread executing
3005 * e.g. in the uiomove in kstrgetmsg.
3006 * This difference is not believed to be significant.
3007 */
3008 /* Set SOREADLOCKED */
3009 error = so_lock_read_intr(so,
3010 uiop->uio_fmode | ((flags & MSG_DONTWAIT) ? FNONBLOCK : 0));
3011 mutex_exit(&so->so_lock);
3012 if (error)
3013 return (error);
3014
3015 /*
3016 * Tell kstrgetmsg to not inspect the stream head errors until all
3017 * queued data has been consumed.
3018 * Use a timeout=-1 to wait forever unless MSG_DONTWAIT is set.
3019 * Also, If uio_fmode indicates nonblocking kstrgetmsg will not block.
3020 *
3021 * MSG_WAITALL only applies to M_DATA and T_DATA_IND messages and
3022 * to T_OPTDATA_IND that do not contain any user-visible control msg.
3023 * Note that MSG_WAITALL set with MSG_PEEK is a noop.
3024 */
3025 pflag = MSG_ANY | MSG_DELAYERROR;
3026 if (flags & MSG_PEEK) {
3027 pflag |= MSG_IPEEK;
3028 flags &= ~MSG_WAITALL;
3029 }
3030 if (so->so_mode & SM_ATOMIC)
3031 pflag |= MSG_DISCARDTAIL;
3032
3033 if (flags & MSG_DONTWAIT)
3034 timout = 0;
3035 else if (so->so_rcvtimeo != 0)
3036 timout = TICK_TO_MSEC(so->so_rcvtimeo);
3037 else
3038 timout = -1;
3039 opflag = pflag;
3040 retry:
3041 saved_resid = uiop->uio_resid;
3042 pri = 0;
3043 mp = NULL;
3044 error = kstrgetmsg(SOTOV(so), &mp, uiop, &pri, &pflag,
3045 timout, &rval);
3046 if (error != 0) {
3047 /* kstrgetmsg returns ETIME when timeout expires */
3048 if (error == ETIME)
3049 error = EWOULDBLOCK;
3050 goto out;
3051 }
3052 /*
3053 * For datagrams the MOREDATA flag is used to set MSG_TRUNC.
3054 * For non-datagrams MOREDATA is used to set MSG_EOR.
3055 */
3056 ASSERT(!(rval.r_val1 & MORECTL));
3057 if ((rval.r_val1 & MOREDATA) && (so->so_mode & SM_ATOMIC))
3058 msg->msg_flags |= MSG_TRUNC;
3059
3060 if (mp == NULL) {
3061 dprintso(so, 1, ("sotpi_recvmsg: got M_DATA\n"));
3062 /*
3063 * 4.3BSD and 4.4BSD clears the mark when peeking across it.
3064 * The draft Posix socket spec states that the mark should
3065 * not be cleared when peeking. We follow the latter.
3066 */
3067 if ((so->so_state &
3068 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3069 (uiop->uio_resid != saved_resid) &&
3070 !(flags & MSG_PEEK)) {
3071 sorecv_update_oobstate(so);
3072 }
3073
3074 mutex_enter(&so->so_lock);
3075 /* Set MSG_EOR based on MOREDATA */
3076 if (!(rval.r_val1 & MOREDATA)) {
3077 if (so->so_state & SS_SAVEDEOR) {
3078 msg->msg_flags |= MSG_EOR;
3079 so->so_state &= ~SS_SAVEDEOR;
3080 }
3081 }
3082 /*
3083 * If some data was received (i.e. not EOF) and the
3084 * read/recv* has not been satisfied wait for some more.
3085 */
3086 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3087 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3088 mutex_exit(&so->so_lock);
3089 pflag = opflag | MSG_NOMARK;
3090 goto retry;
3091 }
3092 goto out_locked;
3093 }
3094
3095 /* strsock_proto has already verified length and alignment */
3096 tpr = (union T_primitives *)mp->b_rptr;
3097 dprintso(so, 1, ("sotpi_recvmsg: type %d\n", tpr->type));
3098
3099 switch (tpr->type) {
3100 case T_DATA_IND: {
3101 if ((so->so_state &
3102 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3103 (uiop->uio_resid != saved_resid) &&
3104 !(flags & MSG_PEEK)) {
3105 sorecv_update_oobstate(so);
3106 }
3107
3108 /*
3109 * Set msg_flags to MSG_EOR based on
3110 * MORE_flag and MOREDATA.
3111 */
3112 mutex_enter(&so->so_lock);
3113 so->so_state &= ~SS_SAVEDEOR;
3114 if (!(tpr->data_ind.MORE_flag & 1)) {
3115 if (!(rval.r_val1 & MOREDATA))
3116 msg->msg_flags |= MSG_EOR;
3117 else
3118 so->so_state |= SS_SAVEDEOR;
3119 }
3120 freemsg(mp);
3121 /*
3122 * If some data was received (i.e. not EOF) and the
3123 * read/recv* has not been satisfied wait for some more.
3124 */
3125 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3126 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3127 mutex_exit(&so->so_lock);
3128 pflag = opflag | MSG_NOMARK;
3129 goto retry;
3130 }
3131 goto out_locked;
3132 }
3133 case T_UNITDATA_IND: {
3134 void *addr;
3135 t_uscalar_t addrlen;
3136 void *abuf;
3137 t_uscalar_t optlen;
3138 void *opt;
3139
3140 if ((so->so_state &
3141 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3142 (uiop->uio_resid != saved_resid) &&
3143 !(flags & MSG_PEEK)) {
3144 sorecv_update_oobstate(so);
3145 }
3146
3147 if (namelen != 0) {
3148 /* Caller wants source address */
3149 addrlen = tpr->unitdata_ind.SRC_length;
3150 addr = sogetoff(mp,
3151 tpr->unitdata_ind.SRC_offset,
3152 addrlen, 1);
3153 if (addr == NULL) {
3154 freemsg(mp);
3155 error = EPROTO;
3156 eprintsoline(so, error);
3157 goto out;
3158 }
3159 if (so->so_family == AF_UNIX) {
3160 /*
3161 * Can not use the transport level address.
3162 * If there is a SO_SRCADDR option carrying
3163 * the socket level address it will be
3164 * extracted below.
3165 */
3166 addr = NULL;
3167 addrlen = 0;
3168 }
3169 }
3170 optlen = tpr->unitdata_ind.OPT_length;
3171 if (optlen != 0) {
3172 t_uscalar_t ncontrollen;
3173
3174 /*
3175 * Extract any source address option.
3176 * Determine how large cmsg buffer is needed.
3177 */
3178 opt = sogetoff(mp,
3179 tpr->unitdata_ind.OPT_offset,
3180 optlen, __TPI_ALIGN_SIZE);
3181
3182 if (opt == NULL) {
3183 freemsg(mp);
3184 error = EPROTO;
3185 eprintsoline(so, error);
3186 goto out;
3187 }
3188 if (so->so_family == AF_UNIX)
3189 so_getopt_srcaddr(opt, optlen, &addr, &addrlen);
3190 ncontrollen = so_cmsglen(mp, opt, optlen,
3191 !(flags & MSG_XPG4_2));
3192 if (controllen != 0)
3193 controllen = ncontrollen;
3194 else if (ncontrollen != 0)
3195 msg->msg_flags |= MSG_CTRUNC;
3196 } else {
3197 controllen = 0;
3198 }
3199
3200 if (namelen != 0) {
3201 /*
3202 * Return address to caller.
3203 * Caller handles truncation if length
3204 * exceeds msg_namelen.
3205 * NOTE: AF_UNIX NUL termination is ensured by
3206 * the sender's copyin_name().
3207 */
3208 abuf = kmem_alloc(addrlen, KM_SLEEP);
3209
3210 bcopy(addr, abuf, addrlen);
3211 msg->msg_name = abuf;
3212 msg->msg_namelen = addrlen;
3213 }
3214
3215 if (controllen != 0) {
3216 /*
3217 * Return control msg to caller.
3218 * Caller handles truncation if length
3219 * exceeds msg_controllen.
3220 */
3221 control = kmem_zalloc(controllen, KM_SLEEP);
3222
3223 error = so_opt2cmsg(mp, opt, optlen, flags, control,
3224 controllen);
3225 if (error) {
3226 freemsg(mp);
3227 if (msg->msg_namelen != 0)
3228 kmem_free(msg->msg_name,
3229 msg->msg_namelen);
3230 kmem_free(control, controllen);
3231 eprintsoline(so, error);
3232 goto out;
3233 }
3234 msg->msg_control = control;
3235 msg->msg_controllen = controllen;
3236 }
3237
3238 freemsg(mp);
3239 goto out;
3240 }
3241 case T_OPTDATA_IND: {
3242 struct T_optdata_req *tdr;
3243 void *opt;
3244 t_uscalar_t optlen;
3245
3246 if ((so->so_state &
3247 (SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK)) &&
3248 (uiop->uio_resid != saved_resid) &&
3249 !(flags & MSG_PEEK)) {
3250 sorecv_update_oobstate(so);
3251 }
3252
3253 tdr = (struct T_optdata_req *)mp->b_rptr;
3254 optlen = tdr->OPT_length;
3255 if (optlen != 0) {
3256 t_uscalar_t ncontrollen;
3257 /*
3258 * Determine how large cmsg buffer is needed.
3259 */
3260 opt = sogetoff(mp,
3261 tpr->optdata_ind.OPT_offset,
3262 optlen, __TPI_ALIGN_SIZE);
3263
3264 if (opt == NULL) {
3265 freemsg(mp);
3266 error = EPROTO;
3267 eprintsoline(so, error);
3268 goto out;
3269 }
3270
3271 ncontrollen = so_cmsglen(mp, opt, optlen,
3272 !(flags & MSG_XPG4_2));
3273 if (controllen != 0)
3274 controllen = ncontrollen;
3275 else if (ncontrollen != 0)
3276 msg->msg_flags |= MSG_CTRUNC;
3277 } else {
3278 controllen = 0;
3279 }
3280
3281 if (controllen != 0) {
3282 /*
3283 * Return control msg to caller.
3284 * Caller handles truncation if length
3285 * exceeds msg_controllen.
3286 */
3287 control = kmem_zalloc(controllen, KM_SLEEP);
3288
3289 error = so_opt2cmsg(mp, opt, optlen, flags, control,
3290 controllen);
3291 if (error) {
3292 freemsg(mp);
3293 kmem_free(control, controllen);
3294 eprintsoline(so, error);
3295 goto out;
3296 }
3297 msg->msg_control = control;
3298 msg->msg_controllen = controllen;
3299 }
3300
3301 /*
3302 * Set msg_flags to MSG_EOR based on
3303 * DATA_flag and MOREDATA.
3304 */
3305 mutex_enter(&so->so_lock);
3306 so->so_state &= ~SS_SAVEDEOR;
3307 if (!(tpr->data_ind.MORE_flag & 1)) {
3308 if (!(rval.r_val1 & MOREDATA))
3309 msg->msg_flags |= MSG_EOR;
3310 else
3311 so->so_state |= SS_SAVEDEOR;
3312 }
3313 freemsg(mp);
3314 /*
3315 * If some data was received (i.e. not EOF) and the
3316 * read/recv* has not been satisfied wait for some more.
3317 * Not possible to wait if control info was received.
3318 */
3319 if ((flags & MSG_WAITALL) && !(msg->msg_flags & MSG_EOR) &&
3320 controllen == 0 &&
3321 uiop->uio_resid != saved_resid && uiop->uio_resid > 0) {
3322 mutex_exit(&so->so_lock);
3323 pflag = opflag | MSG_NOMARK;
3324 goto retry;
3325 }
3326 goto out_locked;
3327 }
3328 case T_EXDATA_IND: {
3329 dprintso(so, 1,
3330 ("sotpi_recvmsg: EXDATA_IND counts %d/%d consumed %ld "
3331 "state %s\n",
3332 sti->sti_oobsigcnt, sti->sti_oobcnt,
3333 saved_resid - uiop->uio_resid,
3334 pr_state(so->so_state, so->so_mode)));
3335 /*
3336 * kstrgetmsg handles MSGMARK so there is nothing to
3337 * inspect in the T_EXDATA_IND.
3338 * strsock_proto makes the stream head queue the T_EXDATA_IND
3339 * as a separate message with no M_DATA component. Furthermore,
3340 * the stream head does not consolidate M_DATA messages onto
3341 * an MSGMARK'ed message ensuring that the T_EXDATA_IND
3342 * remains a message by itself. This is needed since MSGMARK
3343 * marks both the whole message as well as the last byte
3344 * of the message.
3345 */
3346 freemsg(mp);
3347 ASSERT(uiop->uio_resid == saved_resid); /* No data */
3348 if (flags & MSG_PEEK) {
3349 /*
3350 * Even though we are peeking we consume the
3351 * T_EXDATA_IND thereby moving the mark information
3352 * to SS_RCVATMARK. Then the oob code below will
3353 * retry the peeking kstrgetmsg.
3354 * Note that the stream head read queue is
3355 * never flushed without holding SOREADLOCKED
3356 * thus the T_EXDATA_IND can not disappear
3357 * underneath us.
3358 */
3359 dprintso(so, 1,
3360 ("sotpi_recvmsg: consume EXDATA_IND "
3361 "counts %d/%d state %s\n",
3362 sti->sti_oobsigcnt,
3363 sti->sti_oobcnt,
3364 pr_state(so->so_state, so->so_mode)));
3365
3366 pflag = MSG_ANY | MSG_DELAYERROR;
3367 if (so->so_mode & SM_ATOMIC)
3368 pflag |= MSG_DISCARDTAIL;
3369
3370 pri = 0;
3371 mp = NULL;
3372
3373 error = kstrgetmsg(SOTOV(so), &mp, uiop,
3374 &pri, &pflag, (clock_t)-1, &rval);
3375 ASSERT(uiop->uio_resid == saved_resid);
3376
3377 if (error) {
3378 #ifdef SOCK_DEBUG
3379 if (error != EWOULDBLOCK && error != EINTR) {
3380 eprintsoline(so, error);
3381 }
3382 #endif /* SOCK_DEBUG */
3383 goto out;
3384 }
3385 ASSERT(mp);
3386 tpr = (union T_primitives *)mp->b_rptr;
3387 ASSERT(tpr->type == T_EXDATA_IND);
3388 freemsg(mp);
3389 } /* end "if (flags & MSG_PEEK)" */
3390
3391 /*
3392 * Decrement the number of queued and pending oob.
3393 *
3394 * SS_RCVATMARK is cleared when we read past a mark.
3395 * SS_HAVEOOBDATA is cleared when we've read past the
3396 * last mark.
3397 * SS_OOBPEND is cleared if we've read past the last
3398 * mark and no (new) SIGURG has been posted.
3399 */
3400 mutex_enter(&so->so_lock);
3401 ASSERT(so_verify_oobstate(so));
3402 ASSERT(sti->sti_oobsigcnt >= sti->sti_oobcnt);
3403 ASSERT(sti->sti_oobsigcnt > 0);
3404 sti->sti_oobsigcnt--;
3405 ASSERT(sti->sti_oobcnt > 0);
3406 sti->sti_oobcnt--;
3407 /*
3408 * Since the T_EXDATA_IND has been removed from the stream
3409 * head, but we have not read data past the mark,
3410 * sockfs needs to track that the socket is still at the mark.
3411 *
3412 * Since no data was received call kstrgetmsg again to wait
3413 * for data.
3414 */
3415 so->so_state |= SS_RCVATMARK;
3416 mutex_exit(&so->so_lock);
3417 dprintso(so, 1,
3418 ("sotpi_recvmsg: retry EXDATA_IND counts %d/%d state %s\n",
3419 sti->sti_oobsigcnt, sti->sti_oobcnt,
3420 pr_state(so->so_state, so->so_mode)));
3421 pflag = opflag;
3422 goto retry;
3423 }
3424 default:
3425 cmn_err(CE_CONT, "sotpi_recvmsg: so %p prim %d mp %p\n",
3426 (void *)so, tpr->type, (void *)mp);
3427 ASSERT(0);
3428 freemsg(mp);
3429 error = EPROTO;
3430 eprintsoline(so, error);
3431 goto out;
3432 }
3433 /* NOTREACHED */
3434 out:
3435 mutex_enter(&so->so_lock);
3436 out_locked:
3437 so_unlock_read(so); /* Clear SOREADLOCKED */
3438 mutex_exit(&so->so_lock);
3439 return (error);
3440 }
3441
3442 /*
3443 * Sending data with options on a datagram socket.
3444 * Assumes caller has verified that SS_ISBOUND etc. are set.
3445 *
3446 * For AF_UNIX the destination address may be already in
3447 * internal form, as indicated by sti->sti_faddr_noxlate
3448 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3449 * translate the destination address to internal form.
3450 *
3451 * The source address is passed as an option. If passing
3452 * file descriptors, those are passed as file pointers in
3453 * another option.
3454 */
3455 static int
sosend_dgramcmsg(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,void * control,t_uscalar_t controllen,int flags)3456 sosend_dgramcmsg(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3457 struct uio *uiop, void *control, t_uscalar_t controllen, int flags)
3458 {
3459 struct T_unitdata_req tudr;
3460 mblk_t *mp;
3461 int error;
3462 void *addr;
3463 socklen_t addrlen;
3464 void *src;
3465 socklen_t srclen;
3466 ssize_t len;
3467 int size;
3468 struct T_opthdr toh;
3469 struct fdbuf *fdbuf;
3470 t_uscalar_t optlen;
3471 void *fds;
3472 int fdlen;
3473 sotpi_info_t *sti = SOTOTPI(so);
3474
3475 ASSERT(name && namelen);
3476 ASSERT(control && controllen);
3477
3478 len = uiop->uio_resid;
3479 if (len > (ssize_t)sti->sti_tidu_size) {
3480 return (EMSGSIZE);
3481 }
3482
3483 if (sti->sti_faddr_noxlate == 0 &&
3484 (flags & MSG_SENDTO_NOXLATE) == 0) {
3485 /*
3486 * Length and family checks.
3487 * Don't verify internal form.
3488 */
3489 error = so_addr_verify(so, name, namelen);
3490 if (error) {
3491 eprintsoline(so, error);
3492 return (error);
3493 }
3494 }
3495
3496 if (so->so_family == AF_UNIX) {
3497 if (sti->sti_faddr_noxlate) {
3498 /*
3499 * Already have a transport internal address. Do not
3500 * pass any (transport internal) source address.
3501 */
3502 addr = name;
3503 addrlen = namelen;
3504 src = NULL;
3505 srclen = 0;
3506 } else if (flags & MSG_SENDTO_NOXLATE) {
3507 /*
3508 * Have an internal form dest. address.
3509 * Pass the source address as usual.
3510 */
3511 addr = name;
3512 addrlen = namelen;
3513 src = sti->sti_laddr_sa;
3514 srclen = (socklen_t)sti->sti_laddr_len;
3515 } else {
3516 /*
3517 * Pass the sockaddr_un source address as an option
3518 * and translate the remote address.
3519 *
3520 * Note that this code does not prevent sti_laddr_sa
3521 * from changing while it is being used. Thus
3522 * if an unbind+bind occurs concurrently with this
3523 * send the peer might see a partially new and a
3524 * partially old "from" address.
3525 */
3526 src = sti->sti_laddr_sa;
3527 srclen = (socklen_t)sti->sti_laddr_len;
3528 dprintso(so, 1,
3529 ("sosend_dgramcmsg UNIX: srclen %d, src %p\n",
3530 srclen, src));
3531 /*
3532 * The sendmsg caller specified a destination
3533 * address, which we must translate into our
3534 * internal form. addr = &sti->sti_ux_taddr
3535 */
3536 error = so_ux_addr_xlate(so, name, namelen,
3537 (flags & MSG_XPG4_2),
3538 &addr, &addrlen);
3539 if (error) {
3540 eprintsoline(so, error);
3541 return (error);
3542 }
3543 }
3544 } else {
3545 addr = name;
3546 addrlen = namelen;
3547 src = NULL;
3548 srclen = 0;
3549 }
3550 optlen = so_optlen(control, controllen,
3551 !(flags & MSG_XPG4_2));
3552 tudr.PRIM_type = T_UNITDATA_REQ;
3553 tudr.DEST_length = addrlen;
3554 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3555 if (srclen != 0)
3556 tudr.OPT_length = (t_scalar_t)(optlen + sizeof (toh) +
3557 _TPI_ALIGN_TOPT(srclen));
3558 else
3559 tudr.OPT_length = optlen;
3560 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3561 _TPI_ALIGN_TOPT(addrlen));
3562
3563 size = tudr.OPT_offset + tudr.OPT_length;
3564
3565 /*
3566 * File descriptors only when SM_FDPASSING set.
3567 */
3568 error = so_getfdopt(control, controllen,
3569 !(flags & MSG_XPG4_2), &fds, &fdlen);
3570 if (error)
3571 return (error);
3572 if (fdlen != -1) {
3573 if (!(so->so_mode & SM_FDPASSING))
3574 return (EOPNOTSUPP);
3575
3576 error = fdbuf_create(fds, fdlen, &fdbuf);
3577 if (error)
3578 return (error);
3579
3580 /*
3581 * Pre-allocate enough additional space for lower level modules
3582 * to append an option (e.g. see tl_unitdata). The following
3583 * is enough extra space for the largest option we might append.
3584 */
3585 size += sizeof (struct T_opthdr) + ucredsize;
3586 mp = fdbuf_allocmsg(size, fdbuf);
3587 } else {
3588 mp = soallocproto(size, _ALLOC_INTR, CRED());
3589 if (mp == NULL) {
3590 /*
3591 * Caught a signal waiting for memory.
3592 * Let send* return EINTR.
3593 */
3594 return (EINTR);
3595 }
3596 }
3597 soappendmsg(mp, &tudr, sizeof (tudr));
3598 soappendmsg(mp, addr, addrlen);
3599 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3600
3601 if (fdlen != -1) {
3602 ASSERT(fdbuf != NULL);
3603 toh.level = SOL_SOCKET;
3604 toh.name = SO_FILEP;
3605 toh.len = fdbuf->fd_size +
3606 (t_uscalar_t)sizeof (struct T_opthdr);
3607 toh.status = 0;
3608 soappendmsg(mp, &toh, sizeof (toh));
3609 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3610 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3611 }
3612 if (srclen != 0) {
3613 /*
3614 * There is a AF_UNIX sockaddr_un to include as a source
3615 * address option.
3616 */
3617 toh.level = SOL_SOCKET;
3618 toh.name = SO_SRCADDR;
3619 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3620 toh.status = 0;
3621 soappendmsg(mp, &toh, sizeof (toh));
3622 soappendmsg(mp, src, srclen);
3623 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3624 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3625 }
3626 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3627 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3628 /*
3629 * Normally at most 3 bytes left in the message, but we might have
3630 * allowed for extra space if we're passing fd's through.
3631 */
3632 ASSERT(MBLKL(mp) <= (ssize_t)size);
3633
3634 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3635 if (AU_AUDITING())
3636 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3637
3638 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3639 #ifdef SOCK_DEBUG
3640 if (error) {
3641 eprintsoline(so, error);
3642 }
3643 #endif /* SOCK_DEBUG */
3644 return (error);
3645 }
3646
3647 /*
3648 * Sending data with options on a connected stream socket.
3649 * Assumes caller has verified that SS_ISCONNECTED is set.
3650 */
3651 static int
sosend_svccmsg(struct sonode * so,struct uio * uiop,int more,void * control,t_uscalar_t controllen,int flags)3652 sosend_svccmsg(struct sonode *so, struct uio *uiop, int more, void *control,
3653 t_uscalar_t controllen, int flags)
3654 {
3655 struct T_optdata_req tdr;
3656 mblk_t *mp;
3657 int error;
3658 ssize_t iosize;
3659 int size;
3660 struct fdbuf *fdbuf;
3661 t_uscalar_t optlen;
3662 void *fds;
3663 int fdlen;
3664 struct T_opthdr toh;
3665 sotpi_info_t *sti = SOTOTPI(so);
3666
3667 dprintso(so, 1,
3668 ("sosend_svccmsg: resid %ld bytes\n", uiop->uio_resid));
3669
3670 /*
3671 * Has to be bound and connected. However, since no locks are
3672 * held the state could have changed after sotpi_sendmsg checked it
3673 * thus it is not possible to ASSERT on the state.
3674 */
3675
3676 /* Options on connection-oriented only when SM_OPTDATA set. */
3677 if (!(so->so_mode & SM_OPTDATA))
3678 return (EOPNOTSUPP);
3679
3680 do {
3681 /*
3682 * Set the MORE flag if uio_resid does not fit in this
3683 * message or if the caller passed in "more".
3684 * Error for transports with zero tidu_size.
3685 */
3686 tdr.PRIM_type = T_OPTDATA_REQ;
3687 iosize = sti->sti_tidu_size;
3688 if (iosize <= 0)
3689 return (EMSGSIZE);
3690 if (uiop->uio_resid > iosize) {
3691 tdr.DATA_flag = 1;
3692 } else {
3693 if (more)
3694 tdr.DATA_flag = 1;
3695 else
3696 tdr.DATA_flag = 0;
3697 iosize = uiop->uio_resid;
3698 }
3699 dprintso(so, 1, ("sosend_svccmsg: sending %d, %ld bytes\n",
3700 tdr.DATA_flag, iosize));
3701
3702 optlen = so_optlen(control, controllen, !(flags & MSG_XPG4_2));
3703 tdr.OPT_length = optlen;
3704 tdr.OPT_offset = (t_scalar_t)sizeof (tdr);
3705
3706 size = (int)sizeof (tdr) + optlen;
3707 /*
3708 * File descriptors only when SM_FDPASSING set.
3709 */
3710 error = so_getfdopt(control, controllen,
3711 !(flags & MSG_XPG4_2), &fds, &fdlen);
3712 if (error)
3713 return (error);
3714 if (fdlen != -1) {
3715 if (!(so->so_mode & SM_FDPASSING))
3716 return (EOPNOTSUPP);
3717
3718 error = fdbuf_create(fds, fdlen, &fdbuf);
3719 if (error)
3720 return (error);
3721
3722 /*
3723 * Pre-allocate enough additional space for lower level
3724 * modules to append an option (e.g. see tl_unitdata).
3725 * The following is enough extra space for the largest
3726 * option we might append.
3727 */
3728 size += sizeof (struct T_opthdr) + ucredsize;
3729 mp = fdbuf_allocmsg(size, fdbuf);
3730 } else {
3731 mp = soallocproto(size, _ALLOC_INTR, CRED());
3732 if (mp == NULL) {
3733 /*
3734 * Caught a signal waiting for memory.
3735 * Let send* return EINTR.
3736 */
3737 return (EINTR);
3738 }
3739 }
3740 soappendmsg(mp, &tdr, sizeof (tdr));
3741
3742 if (fdlen != -1) {
3743 ASSERT(fdbuf != NULL);
3744 toh.level = SOL_SOCKET;
3745 toh.name = SO_FILEP;
3746 toh.len = fdbuf->fd_size +
3747 (t_uscalar_t)sizeof (struct T_opthdr);
3748 toh.status = 0;
3749 soappendmsg(mp, &toh, sizeof (toh));
3750 soappendmsg(mp, fdbuf, fdbuf->fd_size);
3751 ASSERT(__TPI_TOPT_ISALIGNED(mp->b_wptr));
3752 }
3753 so_cmsg2opt(control, controllen, !(flags & MSG_XPG4_2), mp);
3754 /*
3755 * Normally at most 3 bytes left in the message, but we might
3756 * have allowed for extra space if we're passing fd's through.
3757 */
3758 ASSERT(MBLKL(mp) <= (ssize_t)size);
3759
3760 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3761
3762 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
3763 0, MSG_BAND, 0);
3764 if (error) {
3765 eprintsoline(so, error);
3766 return (error);
3767 }
3768 control = NULL;
3769 if (uiop->uio_resid > 0) {
3770 /*
3771 * Recheck for fatal errors. Fail write even though
3772 * some data have been written. This is consistent
3773 * with strwrite semantics and BSD sockets semantics.
3774 */
3775 if (so->so_state & SS_CANTSENDMORE) {
3776 eprintsoline(so, error);
3777 return (EPIPE);
3778 }
3779 if (so->so_error != 0) {
3780 mutex_enter(&so->so_lock);
3781 error = sogeterr(so, B_TRUE);
3782 mutex_exit(&so->so_lock);
3783 if (error != 0) {
3784 eprintsoline(so, error);
3785 return (error);
3786 }
3787 }
3788 }
3789 } while (uiop->uio_resid > 0);
3790 return (0);
3791 }
3792
3793 /*
3794 * Sending data on a datagram socket.
3795 * Assumes caller has verified that SS_ISBOUND etc. are set.
3796 *
3797 * For AF_UNIX the destination address may be already in
3798 * internal form, as indicated by sti->sti_faddr_noxlate
3799 * or the MSG_SENDTO_NOXLATE flag. Otherwise we need to
3800 * translate the destination address to internal form.
3801 *
3802 * The source address is passed as an option.
3803 */
3804 int
sosend_dgram(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)3805 sosend_dgram(struct sonode *so, struct sockaddr *name, socklen_t namelen,
3806 struct uio *uiop, int flags)
3807 {
3808 struct T_unitdata_req tudr;
3809 mblk_t *mp;
3810 int error;
3811 void *addr;
3812 socklen_t addrlen;
3813 void *src;
3814 socklen_t srclen;
3815 ssize_t len;
3816 sotpi_info_t *sti = SOTOTPI(so);
3817
3818 ASSERT(name != NULL && namelen != 0);
3819
3820 len = uiop->uio_resid;
3821 if (len > sti->sti_tidu_size) {
3822 error = EMSGSIZE;
3823 goto done;
3824 }
3825
3826 if (sti->sti_faddr_noxlate == 0 &&
3827 (flags & MSG_SENDTO_NOXLATE) == 0) {
3828 /*
3829 * Length and family checks.
3830 * Don't verify internal form.
3831 */
3832 error = so_addr_verify(so, name, namelen);
3833 if (error != 0)
3834 goto done;
3835 }
3836
3837 if (sti->sti_direct) /* Never on AF_UNIX */
3838 return (sodgram_direct(so, name, namelen, uiop, flags));
3839
3840 if (so->so_family == AF_UNIX) {
3841 if (sti->sti_faddr_noxlate) {
3842 /*
3843 * Already have a transport internal address. Do not
3844 * pass any (transport internal) source address.
3845 */
3846 addr = name;
3847 addrlen = namelen;
3848 src = NULL;
3849 srclen = 0;
3850 } else if (flags & MSG_SENDTO_NOXLATE) {
3851 /*
3852 * Have an internal form dest. address.
3853 * Pass the source address as usual.
3854 */
3855 addr = name;
3856 addrlen = namelen;
3857 src = sti->sti_laddr_sa;
3858 srclen = (socklen_t)sti->sti_laddr_len;
3859 } else {
3860 /*
3861 * Pass the sockaddr_un source address as an option
3862 * and translate the remote address.
3863 *
3864 * Note that this code does not prevent sti_laddr_sa
3865 * from changing while it is being used. Thus
3866 * if an unbind+bind occurs concurrently with this
3867 * send the peer might see a partially new and a
3868 * partially old "from" address.
3869 */
3870 src = sti->sti_laddr_sa;
3871 srclen = (socklen_t)sti->sti_laddr_len;
3872 dprintso(so, 1,
3873 ("sosend_dgram UNIX: srclen %d, src %p\n",
3874 srclen, src));
3875 /*
3876 * The sendmsg caller specified a destination
3877 * address, which we must translate into our
3878 * internal form. addr = &sti->sti_ux_taddr
3879 */
3880 error = so_ux_addr_xlate(so, name, namelen,
3881 (flags & MSG_XPG4_2),
3882 &addr, &addrlen);
3883 if (error) {
3884 eprintsoline(so, error);
3885 goto done;
3886 }
3887 }
3888 } else {
3889 addr = name;
3890 addrlen = namelen;
3891 src = NULL;
3892 srclen = 0;
3893 }
3894 tudr.PRIM_type = T_UNITDATA_REQ;
3895 tudr.DEST_length = addrlen;
3896 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
3897 if (srclen == 0) {
3898 tudr.OPT_length = 0;
3899 tudr.OPT_offset = 0;
3900
3901 mp = soallocproto2(&tudr, sizeof (tudr),
3902 addr, addrlen, 0, _ALLOC_INTR, CRED());
3903 if (mp == NULL) {
3904 /*
3905 * Caught a signal waiting for memory.
3906 * Let send* return EINTR.
3907 */
3908 error = EINTR;
3909 goto done;
3910 }
3911 } else {
3912 /*
3913 * There is a AF_UNIX sockaddr_un to include as a source
3914 * address option.
3915 */
3916 struct T_opthdr toh;
3917 ssize_t size;
3918
3919 tudr.OPT_length = (t_scalar_t)(sizeof (toh) +
3920 _TPI_ALIGN_TOPT(srclen));
3921 tudr.OPT_offset = (t_scalar_t)(sizeof (tudr) +
3922 _TPI_ALIGN_TOPT(addrlen));
3923
3924 toh.level = SOL_SOCKET;
3925 toh.name = SO_SRCADDR;
3926 toh.len = (t_uscalar_t)(srclen + sizeof (struct T_opthdr));
3927 toh.status = 0;
3928
3929 size = tudr.OPT_offset + tudr.OPT_length;
3930 mp = soallocproto2(&tudr, sizeof (tudr),
3931 addr, addrlen, size, _ALLOC_INTR, CRED());
3932 if (mp == NULL) {
3933 /*
3934 * Caught a signal waiting for memory.
3935 * Let send* return EINTR.
3936 */
3937 error = EINTR;
3938 goto done;
3939 }
3940 mp->b_wptr += _TPI_ALIGN_TOPT(addrlen) - addrlen;
3941 soappendmsg(mp, &toh, sizeof (toh));
3942 soappendmsg(mp, src, srclen);
3943 mp->b_wptr += _TPI_ALIGN_TOPT(srclen) - srclen;
3944 ASSERT(mp->b_wptr <= mp->b_datap->db_lim);
3945 }
3946
3947 if (AU_AUDITING())
3948 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
3949
3950 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
3951 done:
3952 #ifdef SOCK_DEBUG
3953 if (error) {
3954 eprintsoline(so, error);
3955 }
3956 #endif /* SOCK_DEBUG */
3957 return (error);
3958 }
3959
3960 /*
3961 * Sending data on a connected stream socket.
3962 * Assumes caller has verified that SS_ISCONNECTED is set.
3963 */
3964 int
sosend_svc(struct sonode * so,struct uio * uiop,t_scalar_t prim,int more,int sflag)3965 sosend_svc(struct sonode *so, struct uio *uiop, t_scalar_t prim, int more,
3966 int sflag)
3967 {
3968 struct T_data_req tdr;
3969 mblk_t *mp;
3970 int error;
3971 ssize_t iosize;
3972 sotpi_info_t *sti = SOTOTPI(so);
3973
3974 dprintso(so, 1,
3975 ("sosend_svc: %p, resid %ld bytes, prim %d, sflag 0x%x\n",
3976 (void *)so, uiop->uio_resid, prim, sflag));
3977
3978 /*
3979 * Has to be bound and connected. However, since no locks are
3980 * held the state could have changed after sotpi_sendmsg checked it
3981 * thus it is not possible to ASSERT on the state.
3982 */
3983
3984 do {
3985 /*
3986 * Set the MORE flag if uio_resid does not fit in this
3987 * message or if the caller passed in "more".
3988 * Error for transports with zero tidu_size.
3989 */
3990 tdr.PRIM_type = prim;
3991 iosize = sti->sti_tidu_size;
3992 if (iosize <= 0)
3993 return (EMSGSIZE);
3994 if (uiop->uio_resid > iosize) {
3995 tdr.MORE_flag = 1;
3996 } else {
3997 if (more)
3998 tdr.MORE_flag = 1;
3999 else
4000 tdr.MORE_flag = 0;
4001 iosize = uiop->uio_resid;
4002 }
4003 dprintso(so, 1, ("sosend_svc: sending 0x%x %d, %ld bytes\n",
4004 prim, tdr.MORE_flag, iosize));
4005 mp = soallocproto1(&tdr, sizeof (tdr), 0, _ALLOC_INTR, CRED());
4006 if (mp == NULL) {
4007 /*
4008 * Caught a signal waiting for memory.
4009 * Let send* return EINTR.
4010 */
4011 return (EINTR);
4012 }
4013
4014 error = kstrputmsg(SOTOV(so), mp, uiop, iosize,
4015 0, sflag | MSG_BAND, 0);
4016 if (error) {
4017 eprintsoline(so, error);
4018 return (error);
4019 }
4020 if (uiop->uio_resid > 0) {
4021 /*
4022 * Recheck for fatal errors. Fail write even though
4023 * some data have been written. This is consistent
4024 * with strwrite semantics and BSD sockets semantics.
4025 */
4026 if (so->so_state & SS_CANTSENDMORE) {
4027 eprintsoline(so, error);
4028 return (EPIPE);
4029 }
4030 if (so->so_error != 0) {
4031 mutex_enter(&so->so_lock);
4032 error = sogeterr(so, B_TRUE);
4033 mutex_exit(&so->so_lock);
4034 if (error != 0) {
4035 eprintsoline(so, error);
4036 return (error);
4037 }
4038 }
4039 }
4040 } while (uiop->uio_resid > 0);
4041 return (0);
4042 }
4043
4044 /*
4045 * Check the state for errors and call the appropriate send function.
4046 *
4047 * If MSG_DONTROUTE is set (and SO_DONTROUTE isn't already set)
4048 * this function issues a setsockopt to toggle SO_DONTROUTE before and
4049 * after sending the message.
4050 *
4051 * The caller may optionally specify a destination address, for either
4052 * stream or datagram sockets. This table summarizes the cases:
4053 *
4054 * Socket type Dest. given Connected Result
4055 * ----------- ----------- --------- --------------
4056 * Stream * Yes send to conn. addr.
4057 * Stream * No error ENOTCONN
4058 * Dgram yes * send to given addr.
4059 * Dgram no yes send to conn. addr.
4060 * Dgram no no error EDESTADDRREQ
4061 *
4062 * There are subtleties around the destination address when using
4063 * AF_UNIX datagram sockets. When the sendmsg call specifies the
4064 * destination address, it's in (struct sockaddr_un) form and we
4065 * need to translate it to our internal form (struct so_ux_addr).
4066 *
4067 * When the sendmsg call does not specify a destination address
4068 * we're using the peer address saved during sotpi_connect, and
4069 * that address is already in internal form. In this case, the
4070 * (internal only) flag MSG_SENDTO_NOXLATE is set in the flags
4071 * passed to sosend_dgram or sosend_dgramcmsg to indicate that
4072 * those functions should skip translation to internal form.
4073 * Avoiding that translation is not only more efficient, but it's
4074 * also necessary when a process does a connect on an AF_UNIX
4075 * datagram socket and then drops privileges. After the process
 * has dropped privileges, it may no longer be able to look up the
 * external name in the filesystem, but it should still be
4078 * able to send messages on the connected socket by leaving the
4079 * destination name unspecified.
4080 *
4081 * Yet more subtleties arise with sockets connected by socketpair(),
4082 * which puts internal form addresses in the fields where normally
4083 * the external form is found, and sets sti_faddr_noxlate=1, which
4084 * (like flag MSG_SENDTO_NOXLATE) causes the sosend_dgram functions
4085 * to skip translation of destination addresses to internal form.
4086 * However, beware that the flag sti_faddr_noxlate=1 also triggers
4087 * different behaviour almost everywhere AF_UNIX addresses appear.
4088 */
4089 static int
sotpi_sendmsg(struct sonode * so,struct nmsghdr * msg,struct uio * uiop,struct cred * cr)4090 sotpi_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
4091 struct cred *cr)
4092 {
4093 int so_state;
4094 int so_mode;
4095 int error;
4096 struct sockaddr *name;
4097 t_uscalar_t namelen;
4098 int dontroute;
4099 int flags;
4100 sotpi_info_t *sti = SOTOTPI(so);
4101
4102 dprintso(so, 1, ("sotpi_sendmsg(%p, %p, 0x%x) state %s, error %d\n",
4103 (void *)so, (void *)msg, msg->msg_flags,
4104 pr_state(so->so_state, so->so_mode), so->so_error));
4105
4106 if (so->so_version == SOV_STREAM) {
4107 /* The imaginary "sockmod" has been popped - act as a stream */
4108 so_update_attrs(so, SOMOD);
4109 return (strwrite(SOTOV(so), uiop, cr));
4110 }
4111
4112 mutex_enter(&so->so_lock);
4113 so_state = so->so_state;
4114
4115 if (so_state & SS_CANTSENDMORE) {
4116 mutex_exit(&so->so_lock);
4117 return (EPIPE);
4118 }
4119
4120 if (so->so_error != 0) {
4121 error = sogeterr(so, B_TRUE);
4122 if (error != 0) {
4123 mutex_exit(&so->so_lock);
4124 return (error);
4125 }
4126 }
4127
4128 name = (struct sockaddr *)msg->msg_name;
4129 namelen = msg->msg_namelen;
4130 flags = msg->msg_flags;
4131
4132 /*
4133 * Historically, this function does not validate the flags
4134 * passed in, and any errant bits are ignored. However,
	 * we would not want any such errant flag bits accidentally
4136 * being treated as one of the internal-only flags, so
4137 * clear the internal-only flag bits.
4138 */
4139 flags &= ~MSG_SENDTO_NOXLATE;
4140
4141 so_mode = so->so_mode;
4142
4143 if (name == NULL) {
4144 if (!(so_state & SS_ISCONNECTED)) {
4145 mutex_exit(&so->so_lock);
4146 if (so_mode & SM_CONNREQUIRED)
4147 return (ENOTCONN);
4148 else
4149 return (EDESTADDRREQ);
4150 }
4151 /*
4152 * This is a connected socket.
4153 */
4154 if (so_mode & SM_CONNREQUIRED) {
4155 /*
4156 * This is a connected STREAM socket,
4157 * destination not specified.
4158 */
4159 name = NULL;
4160 namelen = 0;
4161 } else {
4162 /*
4163 * Datagram send on connected socket with
4164 * the destination name not specified.
4165 * Use the peer address from connect.
4166 */
4167 if (so->so_family == AF_UNIX) {
4168 /*
4169 * Use the (internal form) address saved
4170 * in sotpi_connect. See above.
4171 */
4172 name = (void *)&sti->sti_ux_faddr;
4173 namelen = sizeof (sti->sti_ux_faddr);
4174 flags |= MSG_SENDTO_NOXLATE;
4175 } else {
4176 ASSERT(sti->sti_faddr_sa);
4177 name = sti->sti_faddr_sa;
4178 namelen = (t_uscalar_t)sti->sti_faddr_len;
4179 }
4180 }
4181 } else {
4182 /*
4183 * Sendmsg specifies a destination name
4184 */
4185 if (!(so_state & SS_ISCONNECTED) &&
4186 (so_mode & SM_CONNREQUIRED)) {
4187 /* i.e. TCP not connected */
4188 mutex_exit(&so->so_lock);
4189 return (ENOTCONN);
4190 }
4191 /*
4192 * Ignore the address on connection-oriented sockets.
4193 * Just like BSD this code does not generate an error for
4194 * TCP (a CONNREQUIRED socket) when sending to an address
4195 * passed in with sendto/sendmsg. Instead the data is
4196 * delivered on the connection as if no address had been
4197 * supplied.
4198 */
4199 if ((so_state & SS_ISCONNECTED) &&
4200 !(so_mode & SM_CONNREQUIRED)) {
4201 mutex_exit(&so->so_lock);
4202 return (EISCONN);
4203 }
4204 if (!(so_state & SS_ISBOUND)) {
4205 so_lock_single(so); /* Set SOLOCKED */
4206 error = sotpi_bind(so, NULL, 0,
4207 _SOBIND_UNSPEC|_SOBIND_LOCK_HELD, cr);
4208 so_unlock_single(so, SOLOCKED);
4209 if (error) {
4210 mutex_exit(&so->so_lock);
4211 eprintsoline(so, error);
4212 return (error);
4213 }
4214 }
4215 /*
4216 * Handle delayed datagram errors. These are only queued
4217 * when the application sets SO_DGRAM_ERRIND.
4218 * Return the error if we are sending to the address
4219 * that was returned in the last T_UDERROR_IND.
4220 * If sending to some other address discard the delayed
4221 * error indication.
4222 */
4223 if (sti->sti_delayed_error) {
4224 struct T_uderror_ind *tudi;
4225 void *addr;
4226 t_uscalar_t addrlen;
4227 boolean_t match = B_FALSE;
4228
4229 ASSERT(sti->sti_eaddr_mp);
4230 error = sti->sti_delayed_error;
4231 sti->sti_delayed_error = 0;
4232 tudi =
4233 (struct T_uderror_ind *)sti->sti_eaddr_mp->b_rptr;
4234 addrlen = tudi->DEST_length;
4235 addr = sogetoff(sti->sti_eaddr_mp,
4236 tudi->DEST_offset, addrlen, 1);
4237 ASSERT(addr); /* Checked by strsock_proto */
4238 switch (so->so_family) {
4239 case AF_INET: {
4240 /* Compare just IP address and port */
4241 sin_t *sin1 = (sin_t *)name;
4242 sin_t *sin2 = (sin_t *)addr;
4243
4244 if (addrlen == sizeof (sin_t) &&
4245 namelen == addrlen &&
4246 sin1->sin_port == sin2->sin_port &&
4247 sin1->sin_addr.s_addr ==
4248 sin2->sin_addr.s_addr)
4249 match = B_TRUE;
4250 break;
4251 }
4252 case AF_INET6: {
4253 /* Compare just IP address and port. Not flow */
4254 sin6_t *sin1 = (sin6_t *)name;
4255 sin6_t *sin2 = (sin6_t *)addr;
4256
4257 if (addrlen == sizeof (sin6_t) &&
4258 namelen == addrlen &&
4259 sin1->sin6_port == sin2->sin6_port &&
4260 IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr,
4261 &sin2->sin6_addr))
4262 match = B_TRUE;
4263 break;
4264 }
4265 case AF_UNIX:
4266 default:
4267 if (namelen == addrlen &&
4268 bcmp(name, addr, namelen) == 0)
4269 match = B_TRUE;
4270 }
4271 if (match) {
4272 freemsg(sti->sti_eaddr_mp);
4273 sti->sti_eaddr_mp = NULL;
4274 mutex_exit(&so->so_lock);
4275 #ifdef DEBUG
4276 dprintso(so, 0,
4277 ("sockfs delayed error %d for %s\n",
4278 error,
4279 pr_addr(so->so_family, name, namelen)));
4280 #endif /* DEBUG */
4281 return (error);
4282 }
4283 freemsg(sti->sti_eaddr_mp);
4284 sti->sti_eaddr_mp = NULL;
4285 }
4286 }
4287 mutex_exit(&so->so_lock);
4288
4289 dontroute = 0;
4290 if ((flags & MSG_DONTROUTE) && !(so->so_options & SO_DONTROUTE)) {
4291 uint32_t val;
4292
4293 val = 1;
4294 error = sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4295 &val, (t_uscalar_t)sizeof (val), cr);
4296 if (error)
4297 return (error);
4298 dontroute = 1;
4299 }
4300
4301 if ((flags & MSG_OOB) && !(so_mode & SM_EXDATA)) {
4302 error = EOPNOTSUPP;
4303 goto done;
4304 }
4305 if (msg->msg_controllen != 0) {
4306 if (!(so_mode & SM_CONNREQUIRED)) {
4307 so_update_attrs(so, SOMOD);
4308 error = sosend_dgramcmsg(so, name, namelen, uiop,
4309 msg->msg_control, msg->msg_controllen, flags);
4310 } else {
4311 if (flags & MSG_OOB) {
4312 /* Can't generate T_EXDATA_REQ with options */
4313 error = EOPNOTSUPP;
4314 goto done;
4315 }
4316 so_update_attrs(so, SOMOD);
4317 error = sosend_svccmsg(so, uiop,
4318 !(flags & MSG_EOR),
4319 msg->msg_control, msg->msg_controllen,
4320 flags);
4321 }
4322 goto done;
4323 }
4324
4325 so_update_attrs(so, SOMOD);
4326 if (!(so_mode & SM_CONNREQUIRED)) {
4327 /*
4328 * If there is no SO_DONTROUTE to turn off return immediately
4329 * from send_dgram. This can allow tail-call optimizations.
4330 */
4331 if (!dontroute) {
4332 return (sosend_dgram(so, name, namelen, uiop, flags));
4333 }
4334 error = sosend_dgram(so, name, namelen, uiop, flags);
4335 } else {
4336 t_scalar_t prim;
4337 int sflag;
4338
4339 /* Ignore msg_name in the connected state */
4340 if (flags & MSG_OOB) {
4341 prim = T_EXDATA_REQ;
4342 /*
4343 * Send down T_EXDATA_REQ even if there is flow
4344 * control for data.
4345 */
4346 sflag = MSG_IGNFLOW;
4347 } else {
4348 if (so_mode & SM_BYTESTREAM) {
4349 /* Byte stream transport - use write */
4350 dprintso(so, 1, ("sotpi_sendmsg: write\n"));
4351
4352 /* Send M_DATA messages */
4353 /*
4354 * If there is no SO_DONTROUTE to turn off,
4355 * sti_direct is on, and there is no flow
4356 * control, we can take the fast path.
4357 */
4358 if (!dontroute && sti->sti_direct != 0 &&
4359 canputnext(SOTOV(so)->v_stream->sd_wrq)) {
4360 return (sostream_direct(so, uiop,
4361 NULL, cr));
4362 }
4363 error = strwrite(SOTOV(so), uiop, cr);
4364 goto done;
4365 }
4366 prim = T_DATA_REQ;
4367 sflag = 0;
4368 }
4369 /*
4370 * If there is no SO_DONTROUTE to turn off return immediately
4371 * from sosend_svc. This can allow tail-call optimizations.
4372 */
4373 if (!dontroute)
4374 return (sosend_svc(so, uiop, prim,
4375 !(flags & MSG_EOR), sflag));
4376 error = sosend_svc(so, uiop, prim,
4377 !(flags & MSG_EOR), sflag);
4378 }
4379 ASSERT(dontroute);
4380 done:
4381 if (dontroute) {
4382 uint32_t val;
4383
4384 val = 0;
4385 (void) sotpi_setsockopt(so, SOL_SOCKET, SO_DONTROUTE,
4386 &val, (t_uscalar_t)sizeof (val), cr);
4387 }
4388 return (error);
4389 }
4390
4391 /*
4392 * kstrwritemp() has very similar semantics as that of strwrite().
4393 * The main difference is it obtains mblks from the caller and also
4394 * does not do any copy as done in strwrite() from user buffers to
4395 * kernel buffers.
4396 *
4397 * Currently, this routine is used by sendfile to send data allocated
4398 * within the kernel without any copying. This interface does not use the
4399 * synchronous stream interface as synch. stream interface implies
4400 * copying.
4401 */
4402 int
kstrwritemp(struct vnode * vp,mblk_t * mp,ushort_t fmode)4403 kstrwritemp(struct vnode *vp, mblk_t *mp, ushort_t fmode)
4404 {
4405 struct stdata *stp;
4406 struct queue *wqp;
4407 mblk_t *newmp;
4408 char waitflag;
4409 int tempmode;
4410 int error = 0;
4411 int done = 0;
4412 struct sonode *so;
4413 boolean_t direct;
4414
4415 ASSERT(vp->v_stream);
4416 stp = vp->v_stream;
4417
4418 so = VTOSO(vp);
4419 direct = _SOTOTPI(so)->sti_direct;
4420
4421 /*
4422 * This is the sockfs direct fast path. canputnext() need
4423 * not be accurate so we don't grab the sd_lock here. If
4424 * we get flow-controlled, we grab sd_lock just before the
4425 * do..while loop below to emulate what strwrite() does.
4426 */
4427 wqp = stp->sd_wrq;
4428 if (canputnext(wqp) && direct &&
4429 !(stp->sd_flag & (STWRERR|STRHUP|STPLEX))) {
4430 return (sostream_direct(so, NULL, mp, CRED()));
4431 } else if (stp->sd_flag & (STWRERR|STRHUP|STPLEX)) {
4432 /* Fast check of flags before acquiring the lock */
4433 mutex_enter(&stp->sd_lock);
4434 error = strgeterr(stp, STWRERR|STRHUP|STPLEX, 0);
4435 mutex_exit(&stp->sd_lock);
4436 if (error != 0) {
4437 if (!(stp->sd_flag & STPLEX) &&
4438 (stp->sd_wput_opt & SW_SIGPIPE)) {
4439 error = EPIPE;
4440 }
4441 return (error);
4442 }
4443 }
4444
4445 waitflag = WRITEWAIT;
4446 if (stp->sd_flag & OLDNDELAY)
4447 tempmode = fmode & ~FNDELAY;
4448 else
4449 tempmode = fmode;
4450
4451 mutex_enter(&stp->sd_lock);
4452 do {
4453 if (canputnext(wqp)) {
4454 mutex_exit(&stp->sd_lock);
4455 if (stp->sd_wputdatafunc != NULL) {
4456 newmp = (stp->sd_wputdatafunc)(vp, mp, NULL,
4457 NULL, NULL, NULL);
4458 if (newmp == NULL) {
4459 /* The caller will free mp */
4460 return (ECOMM);
4461 }
4462 mp = newmp;
4463 }
4464 putnext(wqp, mp);
4465 return (0);
4466 }
4467 error = strwaitq(stp, waitflag, (ssize_t)0, tempmode, -1,
4468 &done);
4469 } while (error == 0 && !done);
4470
4471 mutex_exit(&stp->sd_lock);
4472 /*
4473 * EAGAIN tells the application to try again. ENOMEM
4474 * is returned only if the memory allocation size
4475 * exceeds the physical limits of the system. ENOMEM
4476 * can't be true here.
4477 */
4478 if (error == ENOMEM)
4479 error = EAGAIN;
4480 return (error);
4481 }
4482
4483 /* ARGSUSED */
4484 static int
sotpi_sendmblk(struct sonode * so,struct nmsghdr * msg,int fflag,struct cred * cr,mblk_t ** mpp)4485 sotpi_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
4486 struct cred *cr, mblk_t **mpp)
4487 {
4488 int error;
4489
4490 switch (so->so_family) {
4491 case AF_INET:
4492 case AF_INET6:
4493 case AF_UNIX:
4494 break;
4495 default:
4496 return (EAFNOSUPPORT);
4497
4498 }
4499
4500 if (so->so_state & SS_CANTSENDMORE)
4501 return (EPIPE);
4502
4503 if (so->so_type != SOCK_STREAM)
4504 return (EOPNOTSUPP);
4505
4506 if ((so->so_state & SS_ISCONNECTED) == 0)
4507 return (ENOTCONN);
4508
4509 error = kstrwritemp(so->so_vnode, *mpp, fflag);
4510 if (error == 0)
4511 *mpp = NULL;
4512 return (error);
4513 }
4514
4515 /*
4516 * Sending data on a datagram socket.
4517 * Assumes caller has verified that SS_ISBOUND etc. are set.
4518 */
4519 /* ARGSUSED */
4520 static int
sodgram_direct(struct sonode * so,struct sockaddr * name,socklen_t namelen,struct uio * uiop,int flags)4521 sodgram_direct(struct sonode *so, struct sockaddr *name,
4522 socklen_t namelen, struct uio *uiop, int flags)
4523 {
4524 struct T_unitdata_req tudr;
4525 mblk_t *mp = NULL;
4526 int error = 0;
4527 void *addr;
4528 socklen_t addrlen;
4529 ssize_t len;
4530 struct stdata *stp = SOTOV(so)->v_stream;
4531 int so_state;
4532 queue_t *udp_wq;
4533 boolean_t connected;
4534 mblk_t *mpdata = NULL;
4535 sotpi_info_t *sti = SOTOTPI(so);
4536 uint32_t auditing = AU_AUDITING();
4537
4538 ASSERT(name != NULL && namelen != 0);
4539 ASSERT(!(so->so_mode & SM_CONNREQUIRED));
4540 ASSERT(!(so->so_mode & SM_EXDATA));
4541 ASSERT(so->so_family == AF_INET || so->so_family == AF_INET6);
4542 ASSERT(SOTOV(so)->v_type == VSOCK);
4543
4544 /* Caller checked for proper length */
4545 len = uiop->uio_resid;
4546 ASSERT(len <= sti->sti_tidu_size);
4547
4548 /* Length and family checks have been done by caller */
4549 ASSERT(name->sa_family == so->so_family);
4550 ASSERT(so->so_family == AF_INET ||
4551 (namelen == (socklen_t)sizeof (struct sockaddr_in6)));
4552 ASSERT(so->so_family == AF_INET6 ||
4553 (namelen == (socklen_t)sizeof (struct sockaddr_in)));
4554
4555 addr = name;
4556 addrlen = namelen;
4557
4558 if (stp->sd_sidp != NULL &&
4559 (error = straccess(stp, JCWRITE)) != 0)
4560 goto done;
4561
4562 so_state = so->so_state;
4563
4564 connected = so_state & SS_ISCONNECTED;
4565 if (!connected) {
4566 tudr.PRIM_type = T_UNITDATA_REQ;
4567 tudr.DEST_length = addrlen;
4568 tudr.DEST_offset = (t_scalar_t)sizeof (tudr);
4569 tudr.OPT_length = 0;
4570 tudr.OPT_offset = 0;
4571
4572 mp = soallocproto2(&tudr, sizeof (tudr), addr, addrlen, 0,
4573 _ALLOC_INTR, CRED());
4574 if (mp == NULL) {
4575 /*
4576 * Caught a signal waiting for memory.
4577 * Let send* return EINTR.
4578 */
4579 error = EINTR;
4580 goto done;
4581 }
4582 }
4583
4584 /*
4585 * For UDP we don't break up the copyin into smaller pieces
4586 * as in the TCP case. That means if ENOMEM is returned by
4587 * mcopyinuio() then the uio vector has not been modified at
4588 * all and we fallback to either strwrite() or kstrputmsg()
4589 * below. Note also that we never generate priority messages
4590 * from here.
4591 */
4592 udp_wq = stp->sd_wrq->q_next;
4593 if (canput(udp_wq) &&
4594 (mpdata = mcopyinuio(stp, uiop, -1, -1, &error)) != NULL) {
4595 ASSERT(DB_TYPE(mpdata) == M_DATA);
4596 ASSERT(uiop->uio_resid == 0);
4597 if (!connected)
4598 linkb(mp, mpdata);
4599 else
4600 mp = mpdata;
4601 if (auditing)
4602 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4603
4604 /* Always returns 0... */
4605 return (udp_wput(udp_wq, mp));
4606 }
4607
4608 ASSERT(mpdata == NULL);
4609 if (error != 0 && error != ENOMEM) {
4610 freemsg(mp);
4611 return (error);
4612 }
4613
4614 /*
4615 * For connected, let strwrite() handle the blocking case.
4616 * Otherwise we fall thru and use kstrputmsg().
4617 */
4618 if (connected)
4619 return (strwrite(SOTOV(so), uiop, CRED()));
4620
4621 if (auditing)
4622 audit_sock(T_UNITDATA_REQ, strvp2wq(SOTOV(so)), mp, 0);
4623
4624 error = kstrputmsg(SOTOV(so), mp, uiop, len, 0, MSG_BAND, 0);
4625 done:
4626 #ifdef SOCK_DEBUG
4627 if (error != 0) {
4628 eprintsoline(so, error);
4629 }
4630 #endif /* SOCK_DEBUG */
4631 return (error);
4632 }
4633
4634 int
sostream_direct(struct sonode * so,struct uio * uiop,mblk_t * mp,cred_t * cr)4635 sostream_direct(struct sonode *so, struct uio *uiop, mblk_t *mp, cred_t *cr)
4636 {
4637 struct stdata *stp = SOTOV(so)->v_stream;
4638 ssize_t iosize, rmax, maxblk;
4639 queue_t *tcp_wq = stp->sd_wrq->q_next;
4640 mblk_t *newmp;
4641 int error = 0, wflag = 0;
4642
4643 ASSERT(so->so_mode & SM_BYTESTREAM);
4644 ASSERT(SOTOV(so)->v_type == VSOCK);
4645
4646 if (stp->sd_sidp != NULL &&
4647 (error = straccess(stp, JCWRITE)) != 0)
4648 return (error);
4649
4650 if (uiop == NULL) {
4651 /*
4652 * kstrwritemp() should have checked sd_flag and
4653 * flow-control before coming here. If we end up
4654 * here it means that we can simply pass down the
4655 * data to tcp.
4656 */
4657 ASSERT(mp != NULL);
4658 if (stp->sd_wputdatafunc != NULL) {
4659 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4660 NULL, NULL, NULL);
4661 if (newmp == NULL) {
4662 /* The caller will free mp */
4663 return (ECOMM);
4664 }
4665 mp = newmp;
4666 }
4667 /* Always returns 0... */
4668 return (tcp_wput(tcp_wq, mp));
4669 }
4670
4671 /* Fallback to strwrite() to do proper error handling */
4672 if (stp->sd_flag & (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))
4673 return (strwrite(SOTOV(so), uiop, cr));
4674
4675 rmax = stp->sd_qn_maxpsz;
4676 ASSERT(rmax >= 0 || rmax == INFPSZ);
4677 if (rmax == 0 || uiop->uio_resid <= 0)
4678 return (0);
4679
4680 if (rmax == INFPSZ)
4681 rmax = uiop->uio_resid;
4682
4683 maxblk = stp->sd_maxblk;
4684
4685 for (;;) {
4686 iosize = MIN(uiop->uio_resid, rmax);
4687
4688 mp = mcopyinuio(stp, uiop, iosize, maxblk, &error);
4689 if (mp == NULL) {
4690 /*
4691 * Fallback to strwrite() for ENOMEM; if this
4692 * is our first time in this routine and the uio
4693 * vector has not been modified, we will end up
4694 * calling strwrite() without any flag set.
4695 */
4696 if (error == ENOMEM)
4697 goto slow_send;
4698 else
4699 return (error);
4700 }
4701 ASSERT(uiop->uio_resid >= 0);
4702 /*
4703 * If mp is non-NULL and ENOMEM is set, it means that
4704 * mcopyinuio() was able to break down some of the user
4705 * data into one or more mblks. Send the partial data
4706 * to tcp and let the rest be handled in strwrite().
4707 */
4708 ASSERT(error == 0 || error == ENOMEM);
4709 if (stp->sd_wputdatafunc != NULL) {
4710 newmp = (stp->sd_wputdatafunc)(SOTOV(so), mp, NULL,
4711 NULL, NULL, NULL);
4712 if (newmp == NULL) {
4713 /* The caller will free mp */
4714 return (ECOMM);
4715 }
4716 mp = newmp;
4717 }
4718 (void) tcp_wput(tcp_wq, mp); /* Always returns 0 anyway. */
4719
4720 wflag |= NOINTR;
4721
4722 if (uiop->uio_resid == 0) { /* No more data; we're done */
4723 ASSERT(error == 0);
4724 break;
4725 } else if (error == ENOMEM || !canput(tcp_wq) || (stp->sd_flag &
4726 (STWRERR|STRHUP|STPLEX|STRDELIM|OLDNDELAY))) {
4727 slow_send:
4728 /*
4729 * We were able to send down partial data using
4730 * the direct call interface, but are now relying
4731 * on strwrite() to handle the non-fastpath cases.
4732 * If the socket is blocking we will sleep in
4733 * strwaitq() until write is permitted, otherwise,
4734 * we will need to return the amount of bytes
4735 * written so far back to the app. This is the
4736 * reason why we pass NOINTR flag to strwrite()
4737 * for non-blocking socket, because we don't want
4738 * to return EAGAIN when portion of the user data
4739 * has actually been sent down.
4740 */
4741 return (strwrite_common(SOTOV(so), uiop, cr, wflag));
4742 }
4743 }
4744 return (0);
4745 }
4746
4747 /*
4748 * Update sti_faddr by asking the transport (unless AF_UNIX).
4749 */
4750 /* ARGSUSED */
4751 int
sotpi_getpeername(struct sonode * so,struct sockaddr * name,socklen_t * namelen,boolean_t accept,struct cred * cr)4752 sotpi_getpeername(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4753 boolean_t accept, struct cred *cr)
4754 {
4755 struct strbuf strbuf;
4756 int error = 0, res;
4757 void *addr;
4758 t_uscalar_t addrlen;
4759 k_sigset_t smask;
4760 sotpi_info_t *sti = SOTOTPI(so);
4761
4762 dprintso(so, 1, ("sotpi_getpeername(%p) %s\n",
4763 (void *)so, pr_state(so->so_state, so->so_mode)));
4764
4765 ASSERT(*namelen > 0);
4766 mutex_enter(&so->so_lock);
4767 so_lock_single(so); /* Set SOLOCKED */
4768
4769 if (accept) {
4770 bcopy(sti->sti_faddr_sa, name,
4771 MIN(*namelen, sti->sti_faddr_len));
4772 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4773 goto done;
4774 }
4775
4776 if (!(so->so_state & SS_ISCONNECTED)) {
4777 error = ENOTCONN;
4778 goto done;
4779 }
4780 /* Added this check for X/Open */
4781 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
4782 error = EINVAL;
4783 if (xnet_check_print) {
4784 printf("sockfs: X/Open getpeername check => EINVAL\n");
4785 }
4786 goto done;
4787 }
4788
4789 if (sti->sti_faddr_valid) {
4790 bcopy(sti->sti_faddr_sa, name,
4791 MIN(*namelen, sti->sti_faddr_len));
4792 *namelen = sti->sti_faddr_noxlate ? 0: sti->sti_faddr_len;
4793 goto done;
4794 }
4795
4796 #ifdef DEBUG
4797 dprintso(so, 1, ("sotpi_getpeername (local): %s\n",
4798 pr_addr(so->so_family, sti->sti_faddr_sa,
4799 (t_uscalar_t)sti->sti_faddr_len)));
4800 #endif /* DEBUG */
4801
4802 if (so->so_family == AF_UNIX) {
4803 /* Transport has different name space - return local info */
4804 if (sti->sti_faddr_noxlate)
4805 *namelen = 0;
4806 error = 0;
4807 goto done;
4808 }
4809
4810 ASSERT(so->so_family != AF_UNIX && sti->sti_faddr_noxlate == 0);
4811
4812 ASSERT(sti->sti_faddr_sa);
4813 /* Allocate local buffer to use with ioctl */
4814 addrlen = (t_uscalar_t)sti->sti_faddr_maxlen;
4815 mutex_exit(&so->so_lock);
4816 addr = kmem_alloc(addrlen, KM_SLEEP);
4817
4818 /*
4819 * Issue TI_GETPEERNAME with signals masked.
4820 * Put the result in sti_faddr_sa so that getpeername works after
4821 * a shutdown(output).
4822 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4823 * back to the socket.
4824 */
4825 strbuf.buf = addr;
4826 strbuf.maxlen = addrlen;
4827 strbuf.len = 0;
4828
4829 sigintr(&smask, 0);
4830 res = 0;
4831 ASSERT(cr);
4832 error = strioctl(SOTOV(so), TI_GETPEERNAME, (intptr_t)&strbuf,
4833 0, K_TO_K, cr, &res);
4834 sigunintr(&smask);
4835
4836 mutex_enter(&so->so_lock);
4837 /*
4838 * If there is an error record the error in so_error put don't fail
4839 * the getpeername. Instead fallback on the recorded
4840 * sti->sti_faddr_sa.
4841 */
4842 if (error) {
4843 /*
4844 * Various stream head errors can be returned to the ioctl.
4845 * However, it is impossible to determine which ones of
4846 * these are really socket level errors that were incorrectly
4847 * consumed by the ioctl. Thus this code silently ignores the
4848 * error - to code explicitly does not reinstate the error
4849 * using soseterror().
4850 * Experiments have shows that at least this set of
4851 * errors are reported and should not be reinstated on the
4852 * socket:
4853 * EINVAL E.g. if an I_LINK was in effect when
4854 * getpeername was called.
4855 * EPIPE The ioctl error semantics prefer the write
4856 * side error over the read side error.
4857 * ENOTCONN The transport just got disconnected but
4858 * sockfs had not yet seen the T_DISCON_IND
4859 * when issuing the ioctl.
4860 */
4861 error = 0;
4862 } else if (res == 0 && strbuf.len > 0 &&
4863 (so->so_state & SS_ISCONNECTED)) {
4864 ASSERT(strbuf.len <= (int)sti->sti_faddr_maxlen);
4865 sti->sti_faddr_len = (socklen_t)strbuf.len;
4866 bcopy(addr, sti->sti_faddr_sa, sti->sti_faddr_len);
4867 sti->sti_faddr_valid = 1;
4868
4869 bcopy(addr, name, MIN(*namelen, sti->sti_faddr_len));
4870 *namelen = sti->sti_faddr_len;
4871 }
4872 kmem_free(addr, addrlen);
4873 #ifdef DEBUG
4874 dprintso(so, 1, ("sotpi_getpeername (tp): %s\n",
4875 pr_addr(so->so_family, sti->sti_faddr_sa,
4876 (t_uscalar_t)sti->sti_faddr_len)));
4877 #endif /* DEBUG */
4878 done:
4879 so_unlock_single(so, SOLOCKED);
4880 mutex_exit(&so->so_lock);
4881 return (error);
4882 }
4883
4884 /*
4885 * Update sti_laddr by asking the transport (unless AF_UNIX).
4886 */
4887 int
sotpi_getsockname(struct sonode * so,struct sockaddr * name,socklen_t * namelen,struct cred * cr)4888 sotpi_getsockname(struct sonode *so, struct sockaddr *name, socklen_t *namelen,
4889 struct cred *cr)
4890 {
4891 struct strbuf strbuf;
4892 int error = 0, res;
4893 void *addr;
4894 t_uscalar_t addrlen;
4895 k_sigset_t smask;
4896 sotpi_info_t *sti = SOTOTPI(so);
4897
4898 dprintso(so, 1, ("sotpi_getsockname(%p) %s\n",
4899 (void *)so, pr_state(so->so_state, so->so_mode)));
4900
4901 ASSERT(*namelen > 0);
4902 mutex_enter(&so->so_lock);
4903 so_lock_single(so); /* Set SOLOCKED */
4904
4905 #ifdef DEBUG
4906
4907 dprintso(so, 1, ("sotpi_getsockname (local): %s\n",
4908 pr_addr(so->so_family, sti->sti_laddr_sa,
4909 (t_uscalar_t)sti->sti_laddr_len)));
4910 #endif /* DEBUG */
4911 if (sti->sti_laddr_valid) {
4912 bcopy(sti->sti_laddr_sa, name,
4913 MIN(*namelen, sti->sti_laddr_len));
4914 *namelen = sti->sti_laddr_len;
4915 goto done;
4916 }
4917
4918 if (so->so_family == AF_UNIX) {
4919 /*
4920 * Transport has different name space - return local info. If we
4921 * have enough space, let consumers know the family.
4922 */
4923 if (*namelen >= sizeof (sa_family_t)) {
4924 name->sa_family = AF_UNIX;
4925 *namelen = sizeof (sa_family_t);
4926 } else {
4927 *namelen = 0;
4928 }
4929 error = 0;
4930 goto done;
4931 }
4932 if (!(so->so_state & SS_ISBOUND)) {
4933 /* If not bound, then nothing to return. */
4934 error = 0;
4935 goto done;
4936 }
4937
4938 /* Allocate local buffer to use with ioctl */
4939 addrlen = (t_uscalar_t)sti->sti_laddr_maxlen;
4940 mutex_exit(&so->so_lock);
4941 addr = kmem_alloc(addrlen, KM_SLEEP);
4942
4943 /*
4944 * Issue TI_GETMYNAME with signals masked.
4945 * Put the result in sti_laddr_sa so that getsockname works after
4946 * a shutdown(output).
4947 * If the ioctl fails (e.g. due to a ECONNRESET) the error is reposted
4948 * back to the socket.
4949 */
4950 strbuf.buf = addr;
4951 strbuf.maxlen = addrlen;
4952 strbuf.len = 0;
4953
4954 sigintr(&smask, 0);
4955 res = 0;
4956 ASSERT(cr);
4957 error = strioctl(SOTOV(so), TI_GETMYNAME, (intptr_t)&strbuf,
4958 0, K_TO_K, cr, &res);
4959 sigunintr(&smask);
4960
4961 mutex_enter(&so->so_lock);
4962 /*
4963 * If there is an error record the error in so_error put don't fail
4964 * the getsockname. Instead fallback on the recorded
4965 * sti->sti_laddr_sa.
4966 */
4967 if (error) {
4968 /*
4969 * Various stream head errors can be returned to the ioctl.
4970 * However, it is impossible to determine which ones of
4971 * these are really socket level errors that were incorrectly
4972 * consumed by the ioctl. Thus this code silently ignores the
4973 * error - to code explicitly does not reinstate the error
4974 * using soseterror().
4975 * Experiments have shows that at least this set of
4976 * errors are reported and should not be reinstated on the
4977 * socket:
4978 * EINVAL E.g. if an I_LINK was in effect when
4979 * getsockname was called.
4980 * EPIPE The ioctl error semantics prefer the write
4981 * side error over the read side error.
4982 */
4983 error = 0;
4984 } else if (res == 0 && strbuf.len > 0 &&
4985 (so->so_state & SS_ISBOUND)) {
4986 ASSERT(strbuf.len <= (int)sti->sti_laddr_maxlen);
4987 sti->sti_laddr_len = (socklen_t)strbuf.len;
4988 bcopy(addr, sti->sti_laddr_sa, sti->sti_laddr_len);
4989 sti->sti_laddr_valid = 1;
4990
4991 bcopy(addr, name, MIN(sti->sti_laddr_len, *namelen));
4992 *namelen = sti->sti_laddr_len;
4993 }
4994 kmem_free(addr, addrlen);
4995 #ifdef DEBUG
4996 dprintso(so, 1, ("sotpi_getsockname (tp): %s\n",
4997 pr_addr(so->so_family, sti->sti_laddr_sa,
4998 (t_uscalar_t)sti->sti_laddr_len)));
4999 #endif /* DEBUG */
5000 done:
5001 so_unlock_single(so, SOLOCKED);
5002 mutex_exit(&so->so_lock);
5003 return (error);
5004 }
5005
5006 /*
5007 * Get socket options. For SOL_SOCKET options some options are handled
5008 * by the sockfs while others use the value recorded in the sonode as a
5009 * fallback should the T_SVR4_OPTMGMT_REQ fail.
5010 *
5011 * On return, at most *optlenp bytes are copied to optval.
5012 */
5013 /* ARGSUSED */
5014 int
sotpi_getsockopt(struct sonode * so,int level,int option_name,void * optval,socklen_t * optlenp,int flags,struct cred * cr)5015 sotpi_getsockopt(struct sonode *so, int level, int option_name,
5016 void *optval, socklen_t *optlenp, int flags, struct cred *cr)
5017 {
5018 struct T_optmgmt_req optmgmt_req;
5019 struct T_optmgmt_ack *optmgmt_ack;
5020 struct opthdr oh;
5021 struct opthdr *opt_res;
5022 mblk_t *mp = NULL;
5023 int error = 0;
5024 void *option = NULL; /* Set if fallback value */
5025 t_uscalar_t maxlen = *optlenp;
5026 t_uscalar_t len;
5027 uint32_t value;
5028 struct timeval tmo_val; /* used for SO_RCVTIMEO, SO_SNDTIMEO */
5029 struct timeval32 tmo_val32;
5030 struct so_snd_bufinfo snd_bufinfo; /* used for zero copy */
5031
5032 dprintso(so, 1, ("sotpi_getsockopt(%p, 0x%x, 0x%x, %p, %p) %s\n",
5033 (void *)so, level, option_name, optval, (void *)optlenp,
5034 pr_state(so->so_state, so->so_mode)));
5035
5036 mutex_enter(&so->so_lock);
5037 so_lock_single(so); /* Set SOLOCKED */
5038
5039 len = (t_uscalar_t)sizeof (uint32_t); /* Default */
5040
5041 /*
5042 * Check for SOL_SOCKET options.
5043 * Certain SOL_SOCKET options are returned directly whereas
5044 * others only provide a default (fallback) value should
5045 * the T_SVR4_OPTMGMT_REQ fail.
5046 */
5047 if (level == SOL_SOCKET) {
5048 /* Check parameters */
5049 switch (option_name) {
5050 case SO_TYPE:
5051 case SO_ERROR:
5052 case SO_DEBUG:
5053 case SO_ACCEPTCONN:
5054 case SO_REUSEADDR:
5055 case SO_KEEPALIVE:
5056 case SO_DONTROUTE:
5057 case SO_BROADCAST:
5058 case SO_USELOOPBACK:
5059 case SO_OOBINLINE:
5060 case SO_SNDBUF:
5061 case SO_RCVBUF:
5062 #ifdef notyet
5063 case SO_SNDLOWAT:
5064 case SO_RCVLOWAT:
5065 #endif /* notyet */
5066 case SO_DOMAIN:
5067 case SO_DGRAM_ERRIND:
5068 case SO_PROTOCOL:
5069 if (maxlen < (t_uscalar_t)sizeof (int32_t)) {
5070 error = EINVAL;
5071 eprintsoline(so, error);
5072 goto done2;
5073 }
5074 break;
5075 case SO_RCVTIMEO:
5076 case SO_SNDTIMEO:
5077 if (get_udatamodel() == DATAMODEL_NONE ||
5078 get_udatamodel() == DATAMODEL_NATIVE) {
5079 if (maxlen < sizeof (struct timeval)) {
5080 error = EINVAL;
5081 eprintsoline(so, error);
5082 goto done2;
5083 }
5084 } else {
5085 if (maxlen < sizeof (struct timeval32)) {
5086 error = EINVAL;
5087 eprintsoline(so, error);
5088 goto done2;
5089 }
5090
5091 }
5092 break;
5093 case SO_LINGER:
5094 if (maxlen < (t_uscalar_t)sizeof (struct linger)) {
5095 error = EINVAL;
5096 eprintsoline(so, error);
5097 goto done2;
5098 }
5099 break;
5100 case SO_SND_BUFINFO:
5101 if (maxlen < (t_uscalar_t)
5102 sizeof (struct so_snd_bufinfo)) {
5103 error = EINVAL;
5104 eprintsoline(so, error);
5105 goto done2;
5106 }
5107 break;
5108 }
5109
5110 switch (option_name) {
5111 case SO_TYPE:
5112 value = so->so_type;
5113 option = &value;
5114 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5115
5116 case SO_ERROR:
5117 value = sogeterr(so, B_TRUE);
5118 option = &value;
5119 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5120
5121 case SO_ACCEPTCONN:
5122 if (so->so_state & SS_ACCEPTCONN)
5123 value = SO_ACCEPTCONN;
5124 else
5125 value = 0;
5126 #ifdef DEBUG
5127 if (value) {
5128 dprintso(so, 1,
5129 ("sotpi_getsockopt: 0x%x is set\n",
5130 option_name));
5131 } else {
5132 dprintso(so, 1,
5133 ("sotpi_getsockopt: 0x%x not set\n",
5134 option_name));
5135 }
5136 #endif /* DEBUG */
5137 option = &value;
5138 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5139
5140 case SO_DEBUG:
5141 case SO_REUSEADDR:
5142 case SO_KEEPALIVE:
5143 case SO_DONTROUTE:
5144 case SO_BROADCAST:
5145 case SO_USELOOPBACK:
5146 case SO_OOBINLINE:
5147 case SO_DGRAM_ERRIND:
5148 value = (so->so_options & option_name);
5149 #ifdef DEBUG
5150 if (value) {
5151 dprintso(so, 1,
5152 ("sotpi_getsockopt: 0x%x is set\n",
5153 option_name));
5154 } else {
5155 dprintso(so, 1,
5156 ("sotpi_getsockopt: 0x%x not set\n",
5157 option_name));
5158 }
5159 #endif /* DEBUG */
5160 option = &value;
5161 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5162
5163 /*
5164 * The following options are only returned by sockfs when the
5165 * T_SVR4_OPTMGMT_REQ fails.
5166 */
5167 case SO_LINGER:
5168 option = &so->so_linger;
5169 len = (t_uscalar_t)sizeof (struct linger);
5170 break;
5171 case SO_SNDBUF: {
5172 ssize_t lvalue;
5173
5174 /*
5175 * If the option has not been set then get a default
5176 * value from the write side of the stream. This value is
5177 * returned if the transport fails
5178 * the T_SVR4_OPTMGMT_REQ.
5179 */
5180 lvalue = so->so_sndbuf;
5181 if (lvalue == 0) {
5182 mutex_exit(&so->so_lock);
5183 (void) strqget(strvp2wq(SOTOV(so))->q_next,
5184 QHIWAT, 0, &lvalue);
5185 mutex_enter(&so->so_lock);
5186 dprintso(so, 1,
5187 ("got SO_SNDBUF %ld from q\n", lvalue));
5188 }
5189 value = (int)lvalue;
5190 option = &value;
5191 len = (t_uscalar_t)sizeof (so->so_sndbuf);
5192 break;
5193 }
5194 case SO_RCVBUF: {
5195 ssize_t lvalue;
5196
5197 /*
5198 * If the option has not been set then get a default
5199 * value from the read queue. This value is
5200 * returned if the transport fails
5201 * the T_SVR4_OPTMGMT_REQ.
5202 *
5203 * XXX If SO_RCVBUF has been set and this is an
5204 * XPG 4.2 application then do not ask the transport
5205 * since the transport might adjust the value and not
5206 * return exactly what was set by the application.
5207 * For non-XPG 4.2 application we return the value
5208 * that the transport is actually using.
5209 */
5210 lvalue = so->so_rcvbuf;
5211 if (lvalue == 0) {
5212 mutex_exit(&so->so_lock);
5213 (void) strqget(RD(strvp2wq(SOTOV(so))),
5214 QHIWAT, 0, &lvalue);
5215 mutex_enter(&so->so_lock);
5216 dprintso(so, 1,
5217 ("got SO_RCVBUF %ld from q\n", lvalue));
5218 } else if (flags & _SOGETSOCKOPT_XPG4_2) {
5219 value = (int)lvalue;
5220 option = &value;
5221 goto copyout; /* skip asking transport */
5222 }
5223 value = (int)lvalue;
5224 option = &value;
5225 len = (t_uscalar_t)sizeof (so->so_rcvbuf);
5226 break;
5227 }
5228 case SO_DOMAIN:
5229 value = so->so_family;
5230 option = &value;
5231 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5232
5233 case SO_PROTOCOL:
5234 value = so->so_protocol;
5235 option = &value;
5236 goto copyout; /* No need to issue T_SVR4_OPTMGMT_REQ */
5237
5238 #ifdef notyet
5239 /*
5240 * We do not implement the semantics of these options
5241 * thus we shouldn't implement the options either.
5242 */
5243 case SO_SNDLOWAT:
5244 value = so->so_sndlowat;
5245 option = &value;
5246 break;
5247 case SO_RCVLOWAT:
5248 value = so->so_rcvlowat;
5249 option = &value;
5250 break;
5251 #endif /* notyet */
5252 case SO_SNDTIMEO:
5253 case SO_RCVTIMEO: {
5254 clock_t val;
5255
5256 if (option_name == SO_RCVTIMEO)
5257 val = drv_hztousec(so->so_rcvtimeo);
5258 else
5259 val = drv_hztousec(so->so_sndtimeo);
5260 tmo_val.tv_sec = val / (1000 * 1000);
5261 tmo_val.tv_usec = val % (1000 * 1000);
5262 if (get_udatamodel() == DATAMODEL_NONE ||
5263 get_udatamodel() == DATAMODEL_NATIVE) {
5264 option = &tmo_val;
5265 len = sizeof (struct timeval);
5266 } else {
5267 TIMEVAL_TO_TIMEVAL32(&tmo_val32, &tmo_val);
5268 option = &tmo_val32;
5269 len = sizeof (struct timeval32);
5270 }
5271 break;
5272 }
5273 case SO_SND_BUFINFO: {
5274 snd_bufinfo.sbi_wroff =
5275 (so->so_proto_props).sopp_wroff;
5276 snd_bufinfo.sbi_maxblk =
5277 (so->so_proto_props).sopp_maxblk;
5278 snd_bufinfo.sbi_maxpsz =
5279 (so->so_proto_props).sopp_maxpsz;
5280 snd_bufinfo.sbi_tail =
5281 (so->so_proto_props).sopp_tail;
5282 option = &snd_bufinfo;
5283 len = (t_uscalar_t)sizeof (struct so_snd_bufinfo);
5284 break;
5285 }
5286 }
5287 }
5288
5289 mutex_exit(&so->so_lock);
5290
5291 /* Send request */
5292 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5293 optmgmt_req.MGMT_flags = T_CHECK;
5294 optmgmt_req.OPT_length = (t_scalar_t)(sizeof (oh) + maxlen);
5295 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5296
5297 oh.level = level;
5298 oh.name = option_name;
5299 oh.len = maxlen;
5300
5301 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5302 &oh, sizeof (oh), NULL, maxlen, 0, _ALLOC_SLEEP, cr);
5303 /* Let option management work in the presence of data flow control */
5304 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5305 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5306 mp = NULL;
5307 mutex_enter(&so->so_lock);
5308 if (error) {
5309 eprintsoline(so, error);
5310 goto done2;
5311 }
5312 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5313 (t_uscalar_t)(sizeof (*optmgmt_ack) + sizeof (*opt_res)), &mp, 0);
5314 if (error) {
5315 if (option != NULL) {
5316 /* We have a fallback value */
5317 error = 0;
5318 goto copyout;
5319 }
5320 eprintsoline(so, error);
5321 goto done2;
5322 }
5323 ASSERT(mp);
5324 optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
5325 opt_res = (struct opthdr *)sogetoff(mp, optmgmt_ack->OPT_offset,
5326 optmgmt_ack->OPT_length, __TPI_ALIGN_SIZE);
5327 if (opt_res == NULL) {
5328 if (option != NULL) {
5329 /* We have a fallback value */
5330 error = 0;
5331 goto copyout;
5332 }
5333 error = EPROTO;
5334 eprintsoline(so, error);
5335 goto done;
5336 }
5337 option = &opt_res[1];
5338
5339 /* check to ensure that the option is within bounds */
5340 if (((uintptr_t)option + opt_res->len < (uintptr_t)option) ||
5341 (uintptr_t)option + opt_res->len > (uintptr_t)mp->b_wptr) {
5342 if (option != NULL) {
5343 /* We have a fallback value */
5344 error = 0;
5345 goto copyout;
5346 }
5347 error = EPROTO;
5348 eprintsoline(so, error);
5349 goto done;
5350 }
5351
5352 len = opt_res->len;
5353
5354 copyout: {
5355 t_uscalar_t size = MIN(len, maxlen);
5356 bcopy(option, optval, size);
5357 bcopy(&size, optlenp, sizeof (size));
5358 }
5359 done:
5360 freemsg(mp);
5361 done2:
5362 so_unlock_single(so, SOLOCKED);
5363 mutex_exit(&so->so_lock);
5364
5365 return (error);
5366 }
5367
5368 /*
5369 * Set socket options. All options are passed down in a T_SVR4_OPTMGMT_REQ.
5370 * SOL_SOCKET options are also recorded in the sonode. A setsockopt for
5371 * SOL_SOCKET options will not fail just because the T_SVR4_OPTMGMT_REQ fails -
5372 * setsockopt has to work even if the transport does not support the option.
5373 */
5374 /* ARGSUSED */
5375 int
sotpi_setsockopt(struct sonode * so,int level,int option_name,const void * optval,t_uscalar_t optlen,struct cred * cr)5376 sotpi_setsockopt(struct sonode *so, int level, int option_name,
5377 const void *optval, t_uscalar_t optlen, struct cred *cr)
5378 {
5379 struct T_optmgmt_req optmgmt_req;
5380 struct opthdr oh;
5381 mblk_t *mp;
5382 int error = 0;
5383 boolean_t handled = B_FALSE;
5384
5385 dprintso(so, 1, ("sotpi_setsockopt(%p, 0x%x, 0x%x, %p, %d) %s\n",
5386 (void *)so, level, option_name, optval, optlen,
5387 pr_state(so->so_state, so->so_mode)));
5388
5389 /* X/Open requires this check */
5390 if ((so->so_state & SS_CANTSENDMORE) && !xnet_skip_checks) {
5391 if (xnet_check_print)
5392 printf("sockfs: X/Open setsockopt check => EINVAL\n");
5393 return (EINVAL);
5394 }
5395
5396 mutex_enter(&so->so_lock);
5397 so_lock_single(so); /* Set SOLOCKED */
5398 mutex_exit(&so->so_lock);
5399
5400 optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ;
5401 optmgmt_req.MGMT_flags = T_NEGOTIATE;
5402 optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen;
5403 optmgmt_req.OPT_offset = (t_scalar_t)sizeof (optmgmt_req);
5404
5405 oh.level = level;
5406 oh.name = option_name;
5407 oh.len = optlen;
5408
5409 mp = soallocproto3(&optmgmt_req, sizeof (optmgmt_req),
5410 &oh, sizeof (oh), optval, optlen, 0, _ALLOC_SLEEP, cr);
5411 /* Let option management work in the presence of data flow control */
5412 error = kstrputmsg(SOTOV(so), mp, NULL, 0, 0,
5413 MSG_BAND|MSG_HOLDSIG|MSG_IGNERROR|MSG_IGNFLOW, 0);
5414 mp = NULL;
5415 mutex_enter(&so->so_lock);
5416 if (error) {
5417 eprintsoline(so, error);
5418 goto done2;
5419 }
5420 error = sowaitprim(so, T_SVR4_OPTMGMT_REQ, T_OPTMGMT_ACK,
5421 (t_uscalar_t)sizeof (struct T_optmgmt_ack), &mp, 0);
5422 if (error) {
5423 eprintsoline(so, error);
5424 goto done;
5425 }
5426 ASSERT(mp);
5427 /* No need to verify T_optmgmt_ack */
5428 freemsg(mp);
5429 done:
5430 /*
5431 * Check for SOL_SOCKET options and record their values.
5432 * If we know about a SOL_SOCKET parameter and the transport
5433 * failed it with TBADOPT or TOUTSTATE (i.e. ENOPROTOOPT or
5434 * EPROTO) we let the setsockopt succeed.
5435 */
5436 if (level == SOL_SOCKET) {
5437 /* Check parameters */
5438 switch (option_name) {
5439 case SO_DEBUG:
5440 case SO_REUSEADDR:
5441 case SO_KEEPALIVE:
5442 case SO_DONTROUTE:
5443 case SO_BROADCAST:
5444 case SO_USELOOPBACK:
5445 case SO_OOBINLINE:
5446 case SO_SNDBUF:
5447 case SO_RCVBUF:
5448 #ifdef notyet
5449 case SO_SNDLOWAT:
5450 case SO_RCVLOWAT:
5451 #endif /* notyet */
5452 case SO_DGRAM_ERRIND:
5453 if (optlen != (t_uscalar_t)sizeof (int32_t)) {
5454 error = EINVAL;
5455 eprintsoline(so, error);
5456 goto done2;
5457 }
5458 ASSERT(optval);
5459 handled = B_TRUE;
5460 break;
5461 case SO_SNDTIMEO:
5462 case SO_RCVTIMEO:
5463 if (get_udatamodel() == DATAMODEL_NONE ||
5464 get_udatamodel() == DATAMODEL_NATIVE) {
5465 if (optlen != sizeof (struct timeval)) {
5466 error = EINVAL;
5467 eprintsoline(so, error);
5468 goto done2;
5469 }
5470 } else {
5471 if (optlen != sizeof (struct timeval32)) {
5472 error = EINVAL;
5473 eprintsoline(so, error);
5474 goto done2;
5475 }
5476 }
5477 ASSERT(optval);
5478 handled = B_TRUE;
5479 break;
5480 case SO_LINGER:
5481 if (optlen != (t_uscalar_t)sizeof (struct linger)) {
5482 error = EINVAL;
5483 eprintsoline(so, error);
5484 goto done2;
5485 }
5486 ASSERT(optval);
5487 handled = B_TRUE;
5488 break;
5489 }
5490
5491 #define intvalue (*(int32_t *)optval)
5492
5493 switch (option_name) {
5494 case SO_TYPE:
5495 case SO_ERROR:
5496 case SO_ACCEPTCONN:
5497 /* Can't be set */
5498 error = ENOPROTOOPT;
5499 goto done2;
5500 case SO_LINGER: {
5501 struct linger *l = (struct linger *)optval;
5502
5503 so->so_linger.l_linger = l->l_linger;
5504 if (l->l_onoff) {
5505 so->so_linger.l_onoff = SO_LINGER;
5506 so->so_options |= SO_LINGER;
5507 } else {
5508 so->so_linger.l_onoff = 0;
5509 so->so_options &= ~SO_LINGER;
5510 }
5511 break;
5512 }
5513
5514 case SO_DEBUG:
5515 #ifdef SOCK_TEST
5516 if (intvalue & 2)
5517 sock_test_timelimit = 10 * hz;
5518 else
5519 sock_test_timelimit = 0;
5520
5521 if (intvalue & 4)
5522 do_useracc = 0;
5523 else
5524 do_useracc = 1;
5525 #endif /* SOCK_TEST */
5526 /* FALLTHRU */
5527 case SO_REUSEADDR:
5528 case SO_KEEPALIVE:
5529 case SO_DONTROUTE:
5530 case SO_BROADCAST:
5531 case SO_USELOOPBACK:
5532 case SO_OOBINLINE:
5533 case SO_DGRAM_ERRIND:
5534 if (intvalue != 0) {
5535 dprintso(so, 1,
5536 ("socket_setsockopt: setting 0x%x\n",
5537 option_name));
5538 so->so_options |= option_name;
5539 } else {
5540 dprintso(so, 1,
5541 ("socket_setsockopt: clearing 0x%x\n",
5542 option_name));
5543 so->so_options &= ~option_name;
5544 }
5545 break;
5546 /*
5547 * The following options are only returned by us when the
5548 * transport layer fails.
5549 * XXX XPG 4.2 applications retrieve SO_RCVBUF from sockfs
5550 * since the transport might adjust the value and not
5551 * return exactly what was set by the application.
5552 */
5553 case SO_SNDBUF:
5554 so->so_sndbuf = intvalue;
5555 break;
5556 case SO_RCVBUF:
5557 so->so_rcvbuf = intvalue;
5558 break;
5559 case SO_RCVPSH:
5560 so->so_rcv_timer_interval = intvalue;
5561 break;
5562 #ifdef notyet
5563 /*
5564 * We do not implement the semantics of these options
5565 * thus we shouldn't implement the options either.
5566 */
5567 case SO_SNDLOWAT:
5568 so->so_sndlowat = intvalue;
5569 break;
5570 case SO_RCVLOWAT:
5571 so->so_rcvlowat = intvalue;
5572 break;
5573 #endif /* notyet */
5574 case SO_SNDTIMEO:
5575 case SO_RCVTIMEO: {
5576 struct timeval tl;
5577 clock_t val;
5578
5579 if (get_udatamodel() == DATAMODEL_NONE ||
5580 get_udatamodel() == DATAMODEL_NATIVE) {
5581 bcopy((struct timeval *)optval, &tl,
5582 sizeof (struct timeval));
5583 } else {
5584 TIMEVAL32_TO_TIMEVAL(&tl,
5585 (struct timeval32 *)optval);
5586 }
5587 val = tl.tv_sec * 1000 * 1000 + tl.tv_usec;
5588 if (option_name == SO_RCVTIMEO)
5589 so->so_rcvtimeo = drv_usectohz(val);
5590 else
5591 so->so_sndtimeo = drv_usectohz(val);
5592 break;
5593 }
5594 }
5595 #undef intvalue
5596
5597 if (error) {
5598 if ((error == ENOPROTOOPT || error == EPROTO ||
5599 error == EINVAL) && handled) {
5600 dprintso(so, 1,
5601 ("setsockopt: ignoring error %d for 0x%x\n",
5602 error, option_name));
5603 error = 0;
5604 }
5605 }
5606 }
5607 done2:
5608 so_unlock_single(so, SOLOCKED);
5609 mutex_exit(&so->so_lock);
5610 return (error);
5611 }
5612
5613 /*
5614 * sotpi_close() is called when the last open reference goes away.
5615 */
5616 /* ARGSUSED */
5617 int
sotpi_close(struct sonode * so,int flag,struct cred * cr)5618 sotpi_close(struct sonode *so, int flag, struct cred *cr)
5619 {
5620 struct vnode *vp = SOTOV(so);
5621 dev_t dev;
5622 int error = 0;
5623 sotpi_info_t *sti = SOTOTPI(so);
5624
5625 dprintso(so, 1, ("sotpi_close(%p, %x) %s\n",
5626 (void *)vp, flag, pr_state(so->so_state, so->so_mode)));
5627
5628 dev = sti->sti_dev;
5629
5630 ASSERT(STREAMSTAB(getmajor(dev)));
5631
5632 mutex_enter(&so->so_lock);
5633 so_lock_single(so); /* Set SOLOCKED */
5634
5635 ASSERT(so_verify_oobstate(so));
5636
5637 if (vp->v_stream != NULL) {
5638 vnode_t *ux_vp;
5639
5640 if (so->so_family == AF_UNIX) {
5641 /* Could avoid this when CANTSENDMORE for !dgram */
5642 so_unix_close(so);
5643 }
5644
5645 mutex_exit(&so->so_lock);
5646 /*
5647 * Disassemble the linkage from the AF_UNIX underlying file
5648 * system vnode to this socket (by atomically clearing
5649 * v_stream in vn_rele_stream) before strclose clears sd_vnode
5650 * and frees the stream head.
5651 */
5652 if ((ux_vp = sti->sti_ux_bound_vp) != NULL) {
5653 ASSERT(ux_vp->v_stream);
5654 sti->sti_ux_bound_vp = NULL;
5655 vn_rele_stream(ux_vp);
5656 }
5657 error = strclose(vp, flag, cr);
5658 vp->v_stream = NULL;
5659 mutex_enter(&so->so_lock);
5660 }
5661
5662 /*
5663 * Flush the T_DISCON_IND on sti_discon_ind_mp.
5664 */
5665 so_flush_discon_ind(so);
5666
5667 so_unlock_single(so, SOLOCKED);
5668 mutex_exit(&so->so_lock);
5669
5670 /*
5671 * Needed for STREAMs.
5672 * Decrement the device driver's reference count for streams
5673 * opened via the clone dip. The driver was held in clone_open().
5674 * The absence of clone_close() forces this asymmetry.
5675 */
5676 if (so->so_flag & SOCLONE)
5677 ddi_rele_driver(getmajor(dev));
5678
5679 return (error);
5680 }
5681
5682 static int
sotpi_ioctl(struct sonode * so,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5683 sotpi_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
5684 struct cred *cr, int32_t *rvalp)
5685 {
5686 struct vnode *vp = SOTOV(so);
5687 sotpi_info_t *sti = SOTOTPI(so);
5688 int error = 0;
5689
5690 dprintso(so, 0, ("sotpi_ioctl: cmd 0x%x, arg 0x%lx, state %s\n",
5691 cmd, arg, pr_state(so->so_state, so->so_mode)));
5692
5693 switch (cmd) {
5694 case SIOCSQPTR:
5695 /*
5696 * SIOCSQPTR is valid only when helper stream is created
5697 * by the protocol.
5698 */
5699 case _I_INSERT:
5700 case _I_REMOVE:
5701 /*
5702 * Since there's no compelling reason to support these ioctls
5703 * on sockets, and doing so would increase the complexity
5704 * markedly, prevent it.
5705 */
5706 return (EOPNOTSUPP);
5707
5708 case I_FIND:
5709 case I_LIST:
5710 case I_LOOK:
5711 case I_POP:
5712 case I_PUSH:
5713 /*
5714 * To prevent races and inconsistencies between the actual
5715 * state of the stream and the state according to the sonode,
5716 * we serialize all operations which modify or operate on the
5717 * list of modules on the socket's stream.
5718 */
5719 mutex_enter(&sti->sti_plumb_lock);
5720 error = socktpi_plumbioctl(vp, cmd, arg, mode, cr, rvalp);
5721 mutex_exit(&sti->sti_plumb_lock);
5722 return (error);
5723
5724 default:
5725 if (so->so_version != SOV_STREAM)
5726 break;
5727
5728 /*
5729 * The imaginary "sockmod" has been popped; act as a stream.
5730 */
5731 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5732 }
5733
5734 ASSERT(so->so_version != SOV_STREAM);
5735
5736 /*
5737 * Process socket-specific ioctls.
5738 */
5739 switch (cmd) {
5740 case FIONBIO: {
5741 int32_t value;
5742
5743 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5744 (mode & (int)FKIOCTL)))
5745 return (EFAULT);
5746
5747 mutex_enter(&so->so_lock);
5748 if (value) {
5749 so->so_state |= SS_NDELAY;
5750 } else {
5751 so->so_state &= ~SS_NDELAY;
5752 }
5753 mutex_exit(&so->so_lock);
5754 return (0);
5755 }
5756
5757 case FIOASYNC: {
5758 int32_t value;
5759
5760 if (so_copyin((void *)arg, &value, sizeof (int32_t),
5761 (mode & (int)FKIOCTL)))
5762 return (EFAULT);
5763
5764 mutex_enter(&so->so_lock);
5765 /*
5766 * SS_ASYNC flag not already set correctly?
5767 * (!value != !(so->so_state & SS_ASYNC))
5768 * but some engineers find that too hard to read.
5769 */
5770 if ((value == 0 && (so->so_state & SS_ASYNC) != 0) ||
5771 (value != 0 && (so->so_state & SS_ASYNC) == 0))
5772 error = so_flip_async(so, vp, mode, cr);
5773 mutex_exit(&so->so_lock);
5774 return (error);
5775 }
5776
5777 case SIOCSPGRP:
5778 case FIOSETOWN: {
5779 pid_t pgrp;
5780
5781 if (so_copyin((void *)arg, &pgrp, sizeof (pid_t),
5782 (mode & (int)FKIOCTL)))
5783 return (EFAULT);
5784
5785 mutex_enter(&so->so_lock);
5786 dprintso(so, 1, ("setown: new %d old %d\n", pgrp, so->so_pgrp));
5787 /* Any change? */
5788 if (pgrp != so->so_pgrp)
5789 error = so_set_siggrp(so, vp, pgrp, mode, cr);
5790 mutex_exit(&so->so_lock);
5791 return (error);
5792 }
5793 case SIOCGPGRP:
5794 case FIOGETOWN:
5795 if (so_copyout(&so->so_pgrp, (void *)arg,
5796 sizeof (pid_t), (mode & (int)FKIOCTL)))
5797 return (EFAULT);
5798 return (0);
5799
5800 case SIOCATMARK: {
5801 int retval;
5802 uint_t so_state;
5803
5804 /*
5805 * strwaitmark has a finite timeout after which it
5806 * returns -1 if the mark state is undetermined.
5807 * In order to avoid any race between the mark state
5808 * in sockfs and the mark state in the stream head this
5809 * routine loops until the mark state can be determined
5810 * (or the urgent data indication has been removed by some
5811 * other thread).
5812 */
5813 do {
5814 mutex_enter(&so->so_lock);
5815 so_state = so->so_state;
5816 mutex_exit(&so->so_lock);
5817 if (so_state & SS_RCVATMARK) {
5818 retval = 1;
5819 } else if (!(so_state & SS_OOBPEND)) {
5820 /*
5821 * No SIGURG has been generated -- there is no
5822 * pending or present urgent data. Thus can't
5823 * possibly be at the mark.
5824 */
5825 retval = 0;
5826 } else {
5827 /*
5828 * Have the stream head wait until there is
5829 * either some messages on the read queue, or
5830 * STRATMARK or STRNOTATMARK gets set. The
5831 * STRNOTATMARK flag is used so that the
5832 * transport can send up a MSGNOTMARKNEXT
5833 * M_DATA to indicate that it is not
5834 * at the mark and additional data is not about
5835 * to be send upstream.
5836 *
5837 * If the mark state is undetermined this will
5838 * return -1 and we will loop rechecking the
5839 * socket state.
5840 */
5841 retval = strwaitmark(vp);
5842 }
5843 } while (retval == -1);
5844
5845 if (so_copyout(&retval, (void *)arg, sizeof (int),
5846 (mode & (int)FKIOCTL)))
5847 return (EFAULT);
5848 return (0);
5849 }
5850
5851 case I_FDINSERT:
5852 case I_SENDFD:
5853 case I_RECVFD:
5854 case I_ATMARK:
5855 case _SIOCSOCKFALLBACK:
5856 /*
5857 * These ioctls do not apply to sockets. I_FDINSERT can be
5858 * used to send M_PROTO messages without modifying the socket
5859 * state. I_SENDFD/RECVFD should not be used for socket file
5860 * descriptor passing since they assume a twisted stream.
5861 * SIOCATMARK must be used instead of I_ATMARK.
5862 *
5863 * _SIOCSOCKFALLBACK from an application should never be
5864 * processed. It is only generated by socktpi_open() or
5865 * in response to I_POP or I_PUSH.
5866 */
5867 #ifdef DEBUG
5868 zcmn_err(getzoneid(), CE_WARN,
5869 "Unsupported STREAMS ioctl 0x%x on socket. "
5870 "Pid = %d\n", cmd, curproc->p_pid);
5871 #endif /* DEBUG */
5872 return (EOPNOTSUPP);
5873
5874 case _I_GETPEERCRED:
5875 if ((mode & FKIOCTL) == 0)
5876 return (EINVAL);
5877
5878 mutex_enter(&so->so_lock);
5879 if ((so->so_mode & SM_CONNREQUIRED) == 0) {
5880 error = ENOTSUP;
5881 } else if ((so->so_state & SS_ISCONNECTED) == 0) {
5882 error = ENOTCONN;
5883 } else if (so->so_peercred != NULL) {
5884 k_peercred_t *kp = (k_peercred_t *)arg;
5885 kp->pc_cr = so->so_peercred;
5886 kp->pc_cpid = so->so_cpid;
5887 crhold(so->so_peercred);
5888 } else {
5889 error = EINVAL;
5890 }
5891 mutex_exit(&so->so_lock);
5892 return (error);
5893
5894 default:
5895 /*
5896 * Do the higher-order bits of the ioctl cmd indicate
5897 * that it is an I_* streams ioctl?
5898 */
5899 if ((cmd & 0xffffff00U) == STR &&
5900 so->so_version == SOV_SOCKBSD) {
5901 #ifdef DEBUG
5902 zcmn_err(getzoneid(), CE_WARN,
5903 "Unsupported STREAMS ioctl 0x%x on socket. "
5904 "Pid = %d\n", cmd, curproc->p_pid);
5905 #endif /* DEBUG */
5906 return (EOPNOTSUPP);
5907 }
5908 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5909 }
5910 }
5911
5912 /*
5913 * Handle plumbing-related ioctls.
5914 */
5915 static int
socktpi_plumbioctl(struct vnode * vp,int cmd,intptr_t arg,int mode,struct cred * cr,int32_t * rvalp)5916 socktpi_plumbioctl(struct vnode *vp, int cmd, intptr_t arg, int mode,
5917 struct cred *cr, int32_t *rvalp)
5918 {
5919 static const char sockmod_name[] = "sockmod";
5920 struct sonode *so = VTOSO(vp);
5921 char mname[FMNAMESZ + 1];
5922 int error;
5923 sotpi_info_t *sti = SOTOTPI(so);
5924
5925 ASSERT(MUTEX_HELD(&sti->sti_plumb_lock));
5926
5927 if (so->so_version == SOV_SOCKBSD)
5928 return (EOPNOTSUPP);
5929
5930 if (so->so_version == SOV_STREAM) {
5931 /*
5932 * The imaginary "sockmod" has been popped - act as a stream.
5933 * If this is a push of sockmod then change back to a socket.
5934 */
5935 if (cmd == I_PUSH) {
5936 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
5937 (void *)arg, mname, sizeof (mname), NULL);
5938
5939 if (error == 0 && strcmp(mname, sockmod_name) == 0) {
5940 dprintso(so, 0, ("socktpi_ioctl: going to "
5941 "socket version\n"));
5942 so_stream2sock(so);
5943 return (0);
5944 }
5945 }
5946 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
5947 }
5948
5949 switch (cmd) {
5950 case I_PUSH:
5951 if (sti->sti_direct) {
5952 mutex_enter(&so->so_lock);
5953 so_lock_single(so);
5954 mutex_exit(&so->so_lock);
5955
5956 error = strioctl(vp, _SIOCSOCKFALLBACK, 0, 0, K_TO_K,
5957 cr, rvalp);
5958
5959 mutex_enter(&so->so_lock);
5960 if (error == 0)
5961 sti->sti_direct = 0;
5962 so_unlock_single(so, SOLOCKED);
5963 mutex_exit(&so->so_lock);
5964
5965 if (error != 0)
5966 return (error);
5967 }
5968
5969 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5970 if (error == 0)
5971 sti->sti_pushcnt++;
5972 return (error);
5973
5974 case I_POP:
5975 if (sti->sti_pushcnt == 0) {
5976 /* Emulate sockmod being popped */
5977 dprintso(so, 0,
5978 ("socktpi_ioctl: going to STREAMS version\n"));
5979 return (so_sock2stream(so));
5980 }
5981
5982 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5983 if (error == 0)
5984 sti->sti_pushcnt--;
5985 return (error);
5986
5987 case I_LIST: {
5988 struct str_mlist *kmlistp, *umlistp;
5989 struct str_list kstrlist;
5990 ssize_t kstrlistsize;
5991 int i, nmods;
5992
5993 STRUCT_DECL(str_list, ustrlist);
5994 STRUCT_INIT(ustrlist, mode);
5995
5996 if (arg == 0) {
5997 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
5998 if (error == 0)
5999 (*rvalp)++; /* Add one for sockmod */
6000 return (error);
6001 }
6002
6003 error = so_copyin((void *)arg, STRUCT_BUF(ustrlist),
6004 STRUCT_SIZE(ustrlist), mode & FKIOCTL);
6005 if (error != 0)
6006 return (error);
6007
6008 nmods = STRUCT_FGET(ustrlist, sl_nmods);
6009 if (nmods <= 0)
6010 return (EINVAL);
6011 /*
6012 * Ceiling nmods at nstrpush to prevent someone from
6013 * maliciously consuming lots of kernel memory.
6014 */
6015 nmods = MIN(nmods, nstrpush);
6016
6017 kstrlistsize = (nmods + 1) * sizeof (struct str_mlist);
6018 kstrlist.sl_nmods = nmods;
6019 kstrlist.sl_modlist = kmem_zalloc(kstrlistsize, KM_SLEEP);
6020
6021 error = strioctl(vp, cmd, (intptr_t)&kstrlist, mode, K_TO_K,
6022 cr, rvalp);
6023 if (error != 0)
6024 goto done;
6025
6026 /*
6027 * Considering the module list as a 0-based array of sl_nmods
6028 * modules, sockmod should conceptually exist at slot
6029 * sti_pushcnt. Insert sockmod at this location by sliding all
6030 * of the module names after so_pushcnt over by one. We know
6031 * that there will be room to do this since we allocated
6032 * sl_modlist with an additional slot.
6033 */
6034 for (i = kstrlist.sl_nmods; i > sti->sti_pushcnt; i--)
6035 kstrlist.sl_modlist[i] = kstrlist.sl_modlist[i - 1];
6036
6037 (void) strcpy(kstrlist.sl_modlist[i].l_name, sockmod_name);
6038 kstrlist.sl_nmods++;
6039
6040 /*
6041 * Copy all of the entries out to ustrlist.
6042 */
6043 kmlistp = kstrlist.sl_modlist;
6044 umlistp = STRUCT_FGETP(ustrlist, sl_modlist);
6045 for (i = 0; i < nmods && i < kstrlist.sl_nmods; i++) {
6046 error = so_copyout(kmlistp++, umlistp++,
6047 sizeof (struct str_mlist), mode & FKIOCTL);
6048 if (error != 0)
6049 goto done;
6050 }
6051
6052 error = so_copyout(&i, (void *)arg, sizeof (int32_t),
6053 mode & FKIOCTL);
6054 if (error == 0)
6055 *rvalp = 0;
6056 done:
6057 kmem_free(kstrlist.sl_modlist, kstrlistsize);
6058 return (error);
6059 }
6060 case I_LOOK:
6061 if (sti->sti_pushcnt == 0) {
6062 return (so_copyout(sockmod_name, (void *)arg,
6063 sizeof (sockmod_name), mode & FKIOCTL));
6064 }
6065 return (strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp));
6066
6067 case I_FIND:
6068 error = strioctl(vp, cmd, arg, mode, U_TO_K, cr, rvalp);
6069 if (error && error != EINVAL)
6070 return (error);
6071
6072 /* if not found and string was sockmod return 1 */
6073 if (*rvalp == 0 || error == EINVAL) {
6074 error = ((mode & FKIOCTL) ? copystr : copyinstr)(
6075 (void *)arg, mname, sizeof (mname), NULL);
6076 if (error == ENAMETOOLONG)
6077 error = EINVAL;
6078
6079 if (error == 0 && strcmp(mname, sockmod_name) == 0)
6080 *rvalp = 1;
6081 }
6082 return (error);
6083
6084 default:
6085 panic("socktpi_plumbioctl: unknown ioctl %d", cmd);
6086 break;
6087 }
6088
6089 return (0);
6090 }
6091
6092 /*
6093 * Wrapper around the streams poll routine that implements socket poll
6094 * semantics.
6095 * The sockfs never calls pollwakeup itself - the stream head take care
6096 * of all pollwakeups. Since sockfs never holds so_lock when calling the
6097 * stream head there can never be a deadlock due to holding so_lock across
6098 * pollwakeup and acquiring so_lock in this routine.
6099 *
6100 * However, since the performance of VOP_POLL is critical we avoid
6101 * acquiring so_lock here. This is based on two assumptions:
6102 * - The poll implementation holds locks to serialize the VOP_POLL call
6103 * and a pollwakeup for the same pollhead. This ensures that should
6104 * e.g. so_state change during a socktpi_poll call the pollwakeup
6105 * (which strsock_* and strrput conspire to issue) is issued after
6106 * the state change. Thus the pollwakeup will block until VOP_POLL has
6107 * returned and then wake up poll and have it call VOP_POLL again.
6108 * - The reading of so_state without holding so_lock does not result in
6109 * stale data that is older than the latest state change that has dropped
6110 * so_lock. This is ensured by the mutex_exit issuing the appropriate
6111 * memory barrier to force the data into the coherency domain.
6112 */
6113 static int
sotpi_poll(struct sonode * so,short events,int anyyet,short * reventsp,struct pollhead ** phpp)6114 sotpi_poll(
6115 struct sonode *so,
6116 short events,
6117 int anyyet,
6118 short *reventsp,
6119 struct pollhead **phpp)
6120 {
6121 short origevents = events;
6122 struct vnode *vp = SOTOV(so);
6123 int error;
6124 int so_state = so->so_state; /* snapshot */
6125 sotpi_info_t *sti = SOTOTPI(so);
6126
6127 dprintso(so, 0, ("socktpi_poll(%p): state %s err %d\n",
6128 (void *)vp, pr_state(so_state, so->so_mode), so->so_error));
6129
6130 ASSERT(vp->v_type == VSOCK);
6131 ASSERT(vp->v_stream != NULL);
6132
6133 if (so->so_version == SOV_STREAM) {
6134 /* The imaginary "sockmod" has been popped - act as a stream */
6135 return (strpoll(vp->v_stream, events, anyyet,
6136 reventsp, phpp));
6137 }
6138
6139 if (!(so_state & SS_ISCONNECTED) &&
6140 (so->so_mode & SM_CONNREQUIRED)) {
6141 /* Not connected yet - turn off write side events */
6142 events &= ~(POLLOUT|POLLWRBAND);
6143 }
6144 /*
6145 * Check for errors without calling strpoll if the caller wants them.
6146 * In sockets the errors are represented as input/output events
6147 * and there is no need to ask the stream head for this information.
6148 */
6149 if (so->so_error != 0 &&
6150 ((POLLIN|POLLRDNORM|POLLOUT) & origevents) != 0) {
6151 *reventsp = (POLLIN|POLLRDNORM|POLLOUT) & origevents;
6152 return (0);
6153 }
6154 /*
6155 * Ignore M_PROTO only messages such as the T_EXDATA_IND messages.
6156 * These message with only an M_PROTO/M_PCPROTO part and no M_DATA
6157 * will not trigger a POLLIN event with POLLRDDATA set.
6158 * The handling of urgent data (causing POLLRDBAND) is done by
6159 * inspecting SS_OOBPEND below.
6160 */
6161 events |= POLLRDDATA;
6162
6163 /*
6164 * After shutdown(output) a stream head write error is set.
6165 * However, we should not return output events.
6166 */
6167 events |= POLLNOERR;
6168 error = strpoll(vp->v_stream, events, anyyet,
6169 reventsp, phpp);
6170 if (error)
6171 return (error);
6172
6173 ASSERT(!(*reventsp & POLLERR));
6174
6175 /*
6176 * Notes on T_CONN_IND handling for sockets.
6177 *
6178 * If strpoll() returned without events, SR_POLLIN is guaranteed
6179 * to be set, ensuring any subsequent strrput() runs pollwakeup().
6180 *
6181 * Since the so_lock is not held, soqueueconnind() may have run
6182 * and a T_CONN_IND may be waiting. We now check for any queued
6183 * T_CONN_IND msgs on sti_conn_ind_head and set appropriate events
6184 * to ensure poll returns.
6185 *
6186 * However:
6187 * If the T_CONN_IND hasn't arrived by the time strpoll() returns,
6188 * when strrput() does run for an arriving M_PROTO with T_CONN_IND
6189 * the following actions will occur; taken together they ensure the
6190 * syscall will return.
6191 *
6192 * 1. If a socket, soqueueconnind() will queue the T_CONN_IND but if
6193 * the accept() was run on a non-blocking socket sowaitconnind()
6194 * may have already returned EWOULDBLOCK, so not be waiting to
6195 * process the message. Additionally socktpi_poll() has probably
6196 * proceeded past the sti_conn_ind_head check below.
6197 * 2. strrput() runs pollwakeup()->pollnotify()->cv_signal() to wake
6198 * this thread, however that could occur before poll_common()
6199 * has entered cv_wait.
6200 * 3. pollnotify() sets T_POLLWAKE, while holding the pc_lock.
6201 *
6202 * Before proceeding to cv_wait() in poll_common() for an event,
6203 * poll_common() atomically checks for T_POLLWAKE under the pc_lock,
6204 * and if set, re-calls strpoll() to ensure the late arriving
6205 * T_CONN_IND is recognized, and pollsys() returns.
6206 */
6207
6208 if (sti->sti_conn_ind_head != NULL)
6209 *reventsp |= (POLLIN|POLLRDNORM) & events;
6210
6211 if (so->so_state & SS_CANTRCVMORE) {
6212 *reventsp |= POLLRDHUP & events;
6213
6214 if (so->so_state & SS_CANTSENDMORE)
6215 *reventsp |= POLLHUP;
6216 }
6217
6218 if (so->so_state & SS_OOBPEND)
6219 *reventsp |= POLLRDBAND & events;
6220
6221 return (0);
6222 }
6223
6224 /*ARGSUSED*/
6225 static int
socktpi_constructor(void * buf,void * cdrarg,int kmflags)6226 socktpi_constructor(void *buf, void *cdrarg, int kmflags)
6227 {
6228 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6229 int error = 0;
6230
6231 error = sonode_constructor(buf, cdrarg, kmflags);
6232 if (error != 0)
6233 return (error);
6234
6235 error = i_sotpi_info_constructor(&st->st_info);
6236 if (error != 0)
6237 sonode_destructor(buf, cdrarg);
6238
6239 st->st_sonode.so_priv = &st->st_info;
6240
6241 return (error);
6242 }
6243
6244 /*ARGSUSED1*/
6245 static void
socktpi_destructor(void * buf,void * cdrarg)6246 socktpi_destructor(void *buf, void *cdrarg)
6247 {
6248 sotpi_sonode_t *st = (sotpi_sonode_t *)buf;
6249
6250 ASSERT(st->st_sonode.so_priv == &st->st_info);
6251 st->st_sonode.so_priv = NULL;
6252
6253 i_sotpi_info_destructor(&st->st_info);
6254 sonode_destructor(buf, cdrarg);
6255 }
6256
6257 static int
socktpi_unix_constructor(void * buf,void * cdrarg,int kmflags)6258 socktpi_unix_constructor(void *buf, void *cdrarg, int kmflags)
6259 {
6260 int retval;
6261
6262 if ((retval = socktpi_constructor(buf, cdrarg, kmflags)) == 0) {
6263 struct sonode *so = (struct sonode *)buf;
6264 sotpi_info_t *sti = SOTOTPI(so);
6265
6266 mutex_enter(&socklist.sl_lock);
6267
6268 sti->sti_next_so = socklist.sl_list;
6269 sti->sti_prev_so = NULL;
6270 if (sti->sti_next_so != NULL)
6271 SOTOTPI(sti->sti_next_so)->sti_prev_so = so;
6272 socklist.sl_list = so;
6273
6274 mutex_exit(&socklist.sl_lock);
6275
6276 }
6277 return (retval);
6278 }
6279
6280 static void
socktpi_unix_destructor(void * buf,void * cdrarg)6281 socktpi_unix_destructor(void *buf, void *cdrarg)
6282 {
6283 struct sonode *so = (struct sonode *)buf;
6284 sotpi_info_t *sti = SOTOTPI(so);
6285
6286 mutex_enter(&socklist.sl_lock);
6287
6288 if (sti->sti_next_so != NULL)
6289 SOTOTPI(sti->sti_next_so)->sti_prev_so = sti->sti_prev_so;
6290 if (sti->sti_prev_so != NULL)
6291 SOTOTPI(sti->sti_prev_so)->sti_next_so = sti->sti_next_so;
6292 else
6293 socklist.sl_list = sti->sti_next_so;
6294
6295 mutex_exit(&socklist.sl_lock);
6296
6297 socktpi_destructor(buf, cdrarg);
6298 }
6299
6300 int
socktpi_init(void)6301 socktpi_init(void)
6302 {
6303 /*
6304 * Create sonode caches. We create a special one for AF_UNIX so
6305 * that we can track them for netstat(8).
6306 */
6307 socktpi_cache = kmem_cache_create("socktpi_cache",
6308 sizeof (struct sotpi_sonode), 0, socktpi_constructor,
6309 socktpi_destructor, NULL, NULL, NULL, 0);
6310
6311 socktpi_unix_cache = kmem_cache_create("socktpi_unix_cache",
6312 sizeof (struct sotpi_sonode), 0, socktpi_unix_constructor,
6313 socktpi_unix_destructor, NULL, NULL, NULL, 0);
6314
6315 return (0);
6316 }
6317
6318 /*
6319 * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
6320 *
6321 * Caller must still update state and mode using sotpi_update_state().
6322 */
6323 int
sotpi_convert_sonode(struct sonode * so,struct sockparams * newsp,boolean_t * direct,queue_t ** qp,struct cred * cr)6324 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
6325 boolean_t *direct, queue_t **qp, struct cred *cr)
6326 {
6327 sotpi_info_t *sti;
6328 struct sockparams *origsp = so->so_sockparams;
6329 sock_lower_handle_t handle = so->so_proto_handle;
6330 struct stdata *stp;
6331 struct vnode *vp;
6332 queue_t *q;
6333 int error = 0;
6334
6335 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6336 SS_FALLBACK_PENDING);
6337 ASSERT(SOCK_IS_NONSTR(so));
6338
6339 *qp = NULL;
6340 *direct = B_FALSE;
6341 so->so_sockparams = newsp;
6342 /*
6343 * Allocate and initalize fields required by TPI.
6344 */
6345 (void) sotpi_info_create(so, KM_SLEEP);
6346 sotpi_info_init(so);
6347
6348 if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
6349 sotpi_info_fini(so);
6350 sotpi_info_destroy(so);
6351 return (error);
6352 }
6353 ASSERT(handle == so->so_proto_handle);
6354 sti = SOTOTPI(so);
6355 if (sti->sti_direct != 0)
6356 *direct = B_TRUE;
6357
6358 /*
6359 * Keep the original sp around so we can properly dispose of the
6360 * sonode when the socket is being closed.
6361 */
6362 sti->sti_orig_sp = origsp;
6363
6364 so_basic_strinit(so); /* skips the T_CAPABILITY_REQ */
6365 so_alloc_addr(so, so->so_max_addr_len);
6366
6367 /*
6368 * If the application has done a SIOCSPGRP, make sure the
6369 * STREAM head is aware. This needs to take place before
6370 * the protocol start sending up messages. Otherwise we
6371 * might miss to generate SIGPOLL.
6372 *
6373 * It is possible that the application will receive duplicate
6374 * signals if some were already generated for either data or
6375 * connection indications.
6376 */
6377 if (so->so_pgrp != 0) {
6378 if (so_set_events(so, so->so_vnode, cr) != 0)
6379 so->so_pgrp = 0;
6380 }
6381
6382 /*
6383 * Determine which queue to use.
6384 */
6385 vp = SOTOV(so);
6386 stp = vp->v_stream;
6387 ASSERT(stp != NULL);
6388 q = stp->sd_wrq->q_next;
6389
6390 /*
6391 * Skip any modules that may have been auto pushed when the device
6392 * was opened
6393 */
6394 while (q->q_next != NULL)
6395 q = q->q_next;
6396 *qp = _RD(q);
6397
6398 /* This is now a STREAMS sockets */
6399 so->so_not_str = B_FALSE;
6400
6401 return (error);
6402 }
6403
6404 /*
6405 * Revert a TPI sonode. It is only allowed to revert the sonode during
6406 * the fallback process.
6407 */
6408 void
sotpi_revert_sonode(struct sonode * so,struct cred * cr)6409 sotpi_revert_sonode(struct sonode *so, struct cred *cr)
6410 {
6411 vnode_t *vp = SOTOV(so);
6412
6413 ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
6414 SS_FALLBACK_PENDING);
6415 ASSERT(!SOCK_IS_NONSTR(so));
6416 ASSERT(vp->v_stream != NULL);
6417
6418 strclean(vp);
6419 (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
6420
6421 /*
6422 * Restore the original sockparams. The caller is responsible for
6423 * dropping the ref to the new sp.
6424 */
6425 so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
6426
6427 sotpi_info_fini(so);
6428 sotpi_info_destroy(so);
6429
6430 /* This is no longer a STREAMS sockets */
6431 so->so_not_str = B_TRUE;
6432 }
6433
6434 void
sotpi_update_state(struct sonode * so,struct T_capability_ack * tcap,struct sockaddr * laddr,socklen_t laddrlen,struct sockaddr * faddr,socklen_t faddrlen,short opts)6435 sotpi_update_state(struct sonode *so, struct T_capability_ack *tcap,
6436 struct sockaddr *laddr, socklen_t laddrlen, struct sockaddr *faddr,
6437 socklen_t faddrlen, short opts)
6438 {
6439 sotpi_info_t *sti = SOTOTPI(so);
6440
6441 so_proc_tcapability_ack(so, tcap);
6442
6443 so->so_options |= opts;
6444
6445 /*
6446 * Determine whether the foreign and local address are valid
6447 */
6448 if (laddrlen != 0) {
6449 ASSERT(laddrlen <= sti->sti_laddr_maxlen);
6450 sti->sti_laddr_len = laddrlen;
6451 bcopy(laddr, sti->sti_laddr_sa, laddrlen);
6452 sti->sti_laddr_valid = (so->so_state & SS_ISBOUND);
6453 }
6454
6455 if (faddrlen != 0) {
6456 ASSERT(faddrlen <= sti->sti_faddr_maxlen);
6457 sti->sti_faddr_len = faddrlen;
6458 bcopy(faddr, sti->sti_faddr_sa, faddrlen);
6459 sti->sti_faddr_valid = (so->so_state & SS_ISCONNECTED);
6460 }
6461
6462 }
6463
6464 /*
6465 * Allocate enough space to cache the local and foreign addresses.
6466 */
6467 void
so_alloc_addr(struct sonode * so,t_uscalar_t maxlen)6468 so_alloc_addr(struct sonode *so, t_uscalar_t maxlen)
6469 {
6470 sotpi_info_t *sti = SOTOTPI(so);
6471
6472 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6473 ASSERT(sti->sti_laddr_len == 0 && sti->sti_faddr_len == 0);
6474 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen =
6475 P2ROUNDUP(maxlen, KMEM_ALIGN);
6476 so->so_max_addr_len = sti->sti_laddr_maxlen;
6477 sti->sti_laddr_sa = kmem_alloc(sti->sti_laddr_maxlen * 2, KM_SLEEP);
6478 sti->sti_faddr_sa = (struct sockaddr *)((caddr_t)sti->sti_laddr_sa
6479 + sti->sti_laddr_maxlen);
6480
6481 if (so->so_family == AF_UNIX) {
6482 /*
6483 * Initialize AF_UNIX related fields.
6484 */
6485 bzero(&sti->sti_ux_laddr, sizeof (sti->sti_ux_laddr));
6486 bzero(&sti->sti_ux_faddr, sizeof (sti->sti_ux_faddr));
6487 }
6488 }
6489
6490
6491 sotpi_info_t *
sotpi_sototpi(struct sonode * so)6492 sotpi_sototpi(struct sonode *so)
6493 {
6494 sotpi_info_t *sti;
6495
6496 ASSERT(so != NULL);
6497
6498 sti = (sotpi_info_t *)so->so_priv;
6499
6500 ASSERT(sti != NULL);
6501 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6502
6503 return (sti);
6504 }
6505
6506 static int
i_sotpi_info_constructor(sotpi_info_t * sti)6507 i_sotpi_info_constructor(sotpi_info_t *sti)
6508 {
6509 sti->sti_magic = SOTPI_INFO_MAGIC;
6510 sti->sti_ack_mp = NULL;
6511 sti->sti_discon_ind_mp = NULL;
6512 sti->sti_ux_bound_vp = NULL;
6513 sti->sti_unbind_mp = NULL;
6514
6515 sti->sti_conn_ind_head = NULL;
6516 sti->sti_conn_ind_tail = NULL;
6517
6518 sti->sti_laddr_sa = NULL;
6519 sti->sti_faddr_sa = NULL;
6520
6521 mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
6522 cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
6523
6524 return (0);
6525 }
6526
6527 static void
i_sotpi_info_destructor(sotpi_info_t * sti)6528 i_sotpi_info_destructor(sotpi_info_t *sti)
6529 {
6530 ASSERT(sti->sti_magic == SOTPI_INFO_MAGIC);
6531 ASSERT(sti->sti_ack_mp == NULL);
6532 ASSERT(sti->sti_discon_ind_mp == NULL);
6533 ASSERT(sti->sti_ux_bound_vp == NULL);
6534 ASSERT(sti->sti_unbind_mp == NULL);
6535
6536 ASSERT(sti->sti_conn_ind_head == NULL);
6537 ASSERT(sti->sti_conn_ind_tail == NULL);
6538
6539 ASSERT(sti->sti_laddr_sa == NULL);
6540 ASSERT(sti->sti_faddr_sa == NULL);
6541
6542 mutex_destroy(&sti->sti_plumb_lock);
6543 cv_destroy(&sti->sti_ack_cv);
6544 }
6545
6546 /*
6547 * Creates and attaches TPI information to the given sonode
6548 */
6549 static boolean_t
sotpi_info_create(struct sonode * so,int kmflags)6550 sotpi_info_create(struct sonode *so, int kmflags)
6551 {
6552 sotpi_info_t *sti;
6553
6554 ASSERT(so->so_priv == NULL);
6555
6556 if ((sti = kmem_zalloc(sizeof (*sti), kmflags)) == NULL)
6557 return (B_FALSE);
6558
6559 if (i_sotpi_info_constructor(sti) != 0) {
6560 kmem_free(sti, sizeof (*sti));
6561 return (B_FALSE);
6562 }
6563
6564 so->so_priv = (void *)sti;
6565 return (B_TRUE);
6566 }
6567
6568 /*
6569 * Initializes the TPI information.
6570 */
6571 static void
sotpi_info_init(struct sonode * so)6572 sotpi_info_init(struct sonode *so)
6573 {
6574 struct vnode *vp = SOTOV(so);
6575 sotpi_info_t *sti = SOTOTPI(so);
6576 time_t now;
6577
6578 sti->sti_dev = so->so_sockparams->sp_sdev_info.sd_vnode->v_rdev;
6579 vp->v_rdev = sti->sti_dev;
6580
6581 sti->sti_orig_sp = NULL;
6582
6583 sti->sti_pushcnt = 0;
6584
6585 now = gethrestime_sec();
6586 sti->sti_atime = now;
6587 sti->sti_mtime = now;
6588 sti->sti_ctime = now;
6589
6590 sti->sti_eaddr_mp = NULL;
6591 sti->sti_delayed_error = 0;
6592
6593 sti->sti_provinfo = NULL;
6594
6595 sti->sti_oobcnt = 0;
6596 sti->sti_oobsigcnt = 0;
6597
6598 ASSERT(sti->sti_laddr_sa == NULL && sti->sti_faddr_sa == NULL);
6599
6600 sti->sti_laddr_sa = 0;
6601 sti->sti_faddr_sa = 0;
6602 sti->sti_laddr_maxlen = sti->sti_faddr_maxlen = 0;
6603 sti->sti_laddr_len = sti->sti_faddr_len = 0;
6604
6605 sti->sti_laddr_valid = 0;
6606 sti->sti_faddr_valid = 0;
6607 sti->sti_faddr_noxlate = 0;
6608
6609 sti->sti_direct = 0;
6610
6611 ASSERT(sti->sti_ack_mp == NULL);
6612 ASSERT(sti->sti_ux_bound_vp == NULL);
6613 ASSERT(sti->sti_unbind_mp == NULL);
6614
6615 ASSERT(sti->sti_conn_ind_head == NULL);
6616 ASSERT(sti->sti_conn_ind_tail == NULL);
6617 }
6618
6619 /*
6620 * Given a sonode, grab the TPI info and free any data.
6621 */
6622 static void
sotpi_info_fini(struct sonode * so)6623 sotpi_info_fini(struct sonode *so)
6624 {
6625 sotpi_info_t *sti = SOTOTPI(so);
6626 mblk_t *mp;
6627
6628 ASSERT(sti->sti_discon_ind_mp == NULL);
6629
6630 if ((mp = sti->sti_conn_ind_head) != NULL) {
6631 mblk_t *mp1;
6632
6633 while (mp) {
6634 mp1 = mp->b_next;
6635 mp->b_next = NULL;
6636 freemsg(mp);
6637 mp = mp1;
6638 }
6639 sti->sti_conn_ind_head = sti->sti_conn_ind_tail = NULL;
6640 }
6641
6642 /*
6643 * Protect so->so_[lf]addr_sa so that sockfs_snapshot() can safely
6644 * indirect them. It also uses so_count as a validity test.
6645 */
6646 mutex_enter(&so->so_lock);
6647
6648 if (sti->sti_laddr_sa) {
6649 ASSERT((caddr_t)sti->sti_faddr_sa ==
6650 (caddr_t)sti->sti_laddr_sa + sti->sti_laddr_maxlen);
6651 ASSERT(sti->sti_faddr_maxlen == sti->sti_laddr_maxlen);
6652 sti->sti_laddr_valid = 0;
6653 sti->sti_faddr_valid = 0;
6654 kmem_free(sti->sti_laddr_sa, sti->sti_laddr_maxlen * 2);
6655 sti->sti_laddr_sa = NULL;
6656 sti->sti_laddr_len = sti->sti_laddr_maxlen = 0;
6657 sti->sti_faddr_sa = NULL;
6658 sti->sti_faddr_len = sti->sti_faddr_maxlen = 0;
6659 }
6660
6661 mutex_exit(&so->so_lock);
6662
6663 if ((mp = sti->sti_eaddr_mp) != NULL) {
6664 freemsg(mp);
6665 sti->sti_eaddr_mp = NULL;
6666 sti->sti_delayed_error = 0;
6667 }
6668
6669 if ((mp = sti->sti_ack_mp) != NULL) {
6670 freemsg(mp);
6671 sti->sti_ack_mp = NULL;
6672 }
6673
6674 ASSERT(sti->sti_ux_bound_vp == NULL);
6675 if ((mp = sti->sti_unbind_mp) != NULL) {
6676 freemsg(mp);
6677 sti->sti_unbind_mp = NULL;
6678 }
6679 }
6680
6681 /*
6682 * Destroys the TPI information attached to a sonode.
6683 */
6684 static void
sotpi_info_destroy(struct sonode * so)6685 sotpi_info_destroy(struct sonode *so)
6686 {
6687 sotpi_info_t *sti = SOTOTPI(so);
6688
6689 i_sotpi_info_destructor(sti);
6690 kmem_free(sti, sizeof (*sti));
6691
6692 so->so_priv = NULL;
6693 }
6694
6695 /*
6696 * Create the global sotpi socket module entry. It will never be freed.
6697 */
6698 smod_info_t *
sotpi_smod_create(void)6699 sotpi_smod_create(void)
6700 {
6701 smod_info_t *smodp;
6702
6703 smodp = kmem_zalloc(sizeof (*smodp), KM_SLEEP);
6704 smodp->smod_name = kmem_alloc(sizeof (SOTPI_SMOD_NAME), KM_SLEEP);
6705 (void) strcpy(smodp->smod_name, SOTPI_SMOD_NAME);
6706 /*
6707 * Initialize the smod_refcnt to 1 so it will never be freed.
6708 */
6709 smodp->smod_refcnt = 1;
6710 smodp->smod_uc_version = SOCK_UC_VERSION;
6711 smodp->smod_dc_version = SOCK_DC_VERSION;
6712 smodp->smod_sock_create_func = &sotpi_create;
6713 smodp->smod_sock_destroy_func = &sotpi_destroy;
6714 return (smodp);
6715 }
6716