/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015 by Delphix. All rights reserved.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/debug.h>
#include <sys/sdt.h>
#include <sys/cmn_err.h>
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/optcom.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/ip_impl.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/ipsec_impl.h>
#include <inet/ipclassifier.h>
#include <inet/ipp_common.h>

/*
 * This file implements TCP fusion - a protocol-less data path for TCP
 * loopback connections.  The fusion of two local TCP endpoints occurs
 * at connection establishment time.  Various conditions (see details
 * in tcp_fuse()) need to be met for fusion to be successful.  If it
 * fails, we fall back to the regular TCP data path; if it succeeds,
 * both endpoints proceed to use tcp_fuse_output() as the transmit path.
 * tcp_fuse_output() enqueues application data directly onto the peer's
 * receive queue; no protocol processing is involved.
 *
 * Synchronization is handled by squeue and the mutex tcp_non_sq_lock.
 * One of the requirements for fusion to succeed is that both endpoints
 * need to be using the same squeue.  This ensures that neither side
 * can disappear while the other side is still sending data.  Flow
 * control information is manipulated outside the squeue, so the
 * tcp_non_sq_lock must be held when touching tcp_flow_stopped.
 */

/*
 * Setting this to B_FALSE disables fusion altogether; loopback
 * connections then go through the regular protocol paths.
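 *
 * The flag is examined at connection establishment time, so changing
 * it at runtime (e.g. with mdb or an /etc/system "set" directive)
 * should only affect loopback connections established afterwards.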
 */
boolean_t do_tcp_fusion = B_TRUE;

/*
 * This routine gets called by the eager tcp upon changing state from
 * SYN_RCVD to ESTABLISHED.  It fuses a direct path between itself
 * and the active connect tcp such that the regular tcp processing
 * may be bypassed under allowable circumstances.  Because the fusion
 * requires both endpoints to be in the same squeue, it does not work
 * for simultaneous active connects because there is no easy way to
 * switch from one squeue to another once the connection is created.
 * This is different from the eager tcp case where we assign it the
 * same squeue as the one given to the active connect tcp during open.
 */
void
tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha)
{
	conn_t *peer_connp, *connp = tcp->tcp_connp;
	tcp_t *peer_tcp;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	netstack_t *ns;
	ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;

	ASSERT(!tcp->tcp_fused);
	ASSERT(tcp->tcp_loopback);
	ASSERT(tcp->tcp_loopback_peer == NULL);
	/*
	 * We need to inherit conn_rcvbuf of the listener tcp,
	 * but we can't really use tcp_listener since we get here after
	 * sending up T_CONN_IND and tcp_tli_accept() may be called
	 * independently, at which point tcp_listener is cleared;
	 * this is why we use tcp_saved_listener.  The listener itself
	 * is guaranteed to be around until tcp_accept_finish() is called
	 * on this eager -- this won't happen until we're done since we're
	 * inside the eager's perimeter now.
	 */
	ASSERT(tcp->tcp_saved_listener != NULL);
	/*
	 * Lookup peer endpoint; search for the remote endpoint having
	 * the reversed address-port quadruplet in ESTABLISHED state,
	 * which is guaranteed to be unique in the system.  Zone check
	 * is applied accordingly for loopback address, but not for
	 * local address since we want fusion to happen across Zones.
	 */
	if (connp->conn_ipversion == IPV4_VERSION) {
		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp,
		    (ipha_t *)iphdr, tcpha, ipst);
	} else {
		peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp,
		    (ip6_t *)iphdr, tcpha, ipst);
	}

	/*
	 * We can only proceed if peer exists, resides in the same squeue
	 * as our conn and is not raw-socket.  We also restrict fusion to
	 * endpoints of the same type (STREAMS or non-STREAMS).  The squeue
	 * assignment of this eager tcp was done earlier at the time of SYN
	 * processing in ip_fanout_tcp{_v6}.  Note that sharing a squeue by
	 * itself doesn't guarantee a safe condition to fuse; hence we
	 * perform additional tests below.
	 */
	ASSERT(peer_connp == NULL || peer_connp != connp);
	if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp ||
	    !IPCL_IS_TCP(peer_connp) ||
	    IPCL_IS_NONSTR(connp) != IPCL_IS_NONSTR(peer_connp)) {
		if (peer_connp != NULL) {
			TCP_STAT(tcps, tcp_fusion_unqualified);
			CONN_DEC_REF(peer_connp);
		}
		return;
	}
	peer_tcp = peer_connp->conn_tcp;	/* active connect tcp */

	ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused);
	ASSERT(peer_tcp->tcp_loopback_peer == NULL);
	ASSERT(peer_connp->conn_sqp == connp->conn_sqp);

	/*
	 * Due to IRE changes the peer and we might not agree on tcp_loopback.
	 * We bail in that case.
	 */
	if (!peer_tcp->tcp_loopback) {
		TCP_STAT(tcps, tcp_fusion_unqualified);
		CONN_DEC_REF(peer_connp);
		return;
	}

	/*
	 * If we need to add MD5 Signature options, don't allow fusion.
	 */
	if (tcp->tcp_md5sig || peer_tcp->tcp_md5sig) {
		TCP_STAT(tcps, tcp_fusion_unqualified);
		CONN_DEC_REF(peer_connp);
		return;
	}

	/*
	 * Fuse the endpoints; we perform further checks against both
	 * tcp endpoints to ensure that a fusion is allowed to happen.
	 * In particular, neither endpoint may be marked unfusable and
	 * neither may have unsent data queued on its transmit list.
	 */
	ns = tcps->tcps_netstack;
	ipst = ns->netstack_ip;

	if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable &&
	    tcp->tcp_xmit_head == NULL && peer_tcp->tcp_xmit_head == NULL) {
		mblk_t *mp = NULL;
		queue_t *peer_rq = peer_connp->conn_rq;

		ASSERT(!TCP_IS_DETACHED(peer_tcp));
		ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
		ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL);

		/*
		 * We need to drain data on both endpoints during unfuse.
		 * If we need to send up SIGURG at the time of draining,
		 * we want to be sure that an mblk is readily available.
		 * This is why we pre-allocate the M_PCSIG mblks for both
		 * endpoints which will only be used during/after unfuse.
		 * The mblk might already exist if we are doing a re-fuse.
		 */
		if (!IPCL_IS_NONSTR(tcp->tcp_connp)) {
			ASSERT(!IPCL_IS_NONSTR(peer_tcp->tcp_connp));

			if (tcp->tcp_fused_sigurg_mp == NULL) {
				if ((mp = allocb(1, BPRI_HI)) == NULL)
					goto failed;
				tcp->tcp_fused_sigurg_mp = mp;
			}

			if (peer_tcp->tcp_fused_sigurg_mp == NULL) {
				if ((mp = allocb(1, BPRI_HI)) == NULL)
					goto failed;
				peer_tcp->tcp_fused_sigurg_mp = mp;
			}

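			/*
			 * Also pre-allocate the M_SETOPTS mblk used below to
			 * send the stream head options to the peer once the
			 * endpoints are fused.
			 */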
			if ((mp = allocb(sizeof (struct stroptions),
			    BPRI_HI)) == NULL)
				goto failed;
		}

		/* Fuse both endpoints */
		peer_tcp->tcp_loopback_peer = tcp;
		tcp->tcp_loopback_peer = peer_tcp;
		peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE;

		/*
		 * We never use regular tcp paths in fusion and should
		 * therefore clear tcp_unsent on both endpoints.  Having
		 * them set to non-zero values means asking for trouble
		 * especially after unfuse, where we may end up sending
		 * through regular tcp paths which expect xmit_list and
		 * friends to be correctly setup.
		 */
		peer_tcp->tcp_unsent = tcp->tcp_unsent = 0;

		tcp_timers_stop(tcp);
		tcp_timers_stop(peer_tcp);

		/*
		 * Set receive buffer and max packet size for the
		 * active open tcp.
		 * The eager's values will be set in tcp_accept_finish().
		 */
		(void) tcp_rwnd_set(peer_tcp, peer_tcp->tcp_connp->conn_rcvbuf);

		/*
		 * Set the write offset value to zero since we won't
		 * be needing any room for TCP/IP headers.
		 */
		if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) {
			struct stroptions *stropt;

			DB_TYPE(mp) = M_SETOPTS;
			mp->b_wptr += sizeof (*stropt);

			stropt = (struct stroptions *)mp->b_rptr;
			stropt->so_flags = SO_WROFF | SO_MAXBLK;
			stropt->so_wroff = 0;
			stropt->so_maxblk = INFPSZ;

			/* Send the options up */
			putnext(peer_rq, mp);
		} else {
			struct sock_proto_props sopp;

			/* The peer is a non-STREAMS end point */
			ASSERT(IPCL_IS_TCP(peer_connp));

			sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_MAXBLK;
			sopp.sopp_wroff = 0;
			sopp.sopp_maxblk = INFPSZ;
			(*peer_connp->conn_upcalls->su_set_proto_props)
			    (peer_connp->conn_upper_handle, &sopp);
		}
	} else {
		TCP_STAT(tcps, tcp_fusion_unqualified);
	}
	CONN_DEC_REF(peer_connp);
	return;

failed:
	if (tcp->tcp_fused_sigurg_mp != NULL) {
		freeb(tcp->tcp_fused_sigurg_mp);
		tcp->tcp_fused_sigurg_mp = NULL;
	}
	if (peer_tcp->tcp_fused_sigurg_mp != NULL) {
		freeb(peer_tcp->tcp_fused_sigurg_mp);
		peer_tcp->tcp_fused_sigurg_mp = NULL;
	}
	CONN_DEC_REF(peer_connp);
}

/*
 * Unfuse a previously-fused pair of tcp loopback endpoints.
 */
void
tcp_unfuse(tcp_t *tcp)
{
	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	ASSERT(tcp->tcp_fused && peer_tcp != NULL);
	ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp);
	ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp);
	ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0);

	/*
	 * Cancel any pending push timers.
	 */
	if (tcp->tcp_push_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
		tcp->tcp_push_tid = 0;
	}
	if (peer_tcp->tcp_push_tid != 0) {
		(void) TCP_TIMER_CANCEL(peer_tcp, peer_tcp->tcp_push_tid);
		peer_tcp->tcp_push_tid = 0;
	}

	/*
	 * Drain any pending data; note that in case of a detached tcp, the
	 * draining will happen later after the tcp is unfused.  For non-
	 * urgent data, this can be handled by the regular tcp_rcv_drain().
	 * If we have urgent data sitting in the receive list, we will
	 * need to send up a SIGURG signal first before draining the data.
	 * All of these will be handled by the code in tcp_fuse_rcv_drain()
	 * when called from tcp_rcv_drain().
	 */
	if (!TCP_IS_DETACHED(tcp)) {
		(void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp,
		    &tcp->tcp_fused_sigurg_mp);
	}
	if (!TCP_IS_DETACHED(peer_tcp)) {
		(void) tcp_fuse_rcv_drain(peer_tcp->tcp_connp->conn_rq,
		    peer_tcp, &peer_tcp->tcp_fused_sigurg_mp);
	}

	/* Lift up any flow-control conditions */
	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped) {
		tcp_clrqfull(tcp);
		TCP_STAT(tcps, tcp_fusion_backenabled);
	}
	mutex_exit(&tcp->tcp_non_sq_lock);

	mutex_enter(&peer_tcp->tcp_non_sq_lock);
	if (peer_tcp->tcp_flow_stopped) {
		tcp_clrqfull(peer_tcp);
		TCP_STAT(tcps, tcp_fusion_backenabled);
	}
	mutex_exit(&peer_tcp->tcp_non_sq_lock);

	/*
	 * Update tha_seq and tha_ack in the header template so that the
	 * regular tcp path resumes from the correct sequence numbers once
	 * the endpoints are unfused.
	 */
	tcp->tcp_tcpha->tha_seq = htonl(tcp->tcp_snxt);
	tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt);
	peer_tcp->tcp_tcpha->tha_seq = htonl(peer_tcp->tcp_snxt);
	peer_tcp->tcp_tcpha->tha_ack = htonl(peer_tcp->tcp_rnxt);

	/* Unfuse the endpoints */
	peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE;
	peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL;
}

/*
 * Fusion output routine used to handle urgent data sent by STREAMS based
 * endpoints.  This routine is called by tcp_fuse_output() for handling
 * non-M_DATA mblks.
 */
void
tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp)
{
	mblk_t *mp1;
	struct T_exdata_ind *tei;
	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
	mblk_t *head, *prev_head = NULL;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	ASSERT(tcp->tcp_fused);
	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
	ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));
	ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
	ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA);
	ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0);

	/*
	 * Urgent data arrives in the form of T_EXDATA_REQ from above.
	 * Each occurrence denotes a new urgent pointer.  For each new
	 * urgent pointer we signal (SIGURG) the receiving app to indicate
	 * that it needs to go into urgent mode.  This is similar to the
	 * urgent data handling in the regular tcp.  We don't need to keep
	 * track of where the urgent pointer is, because each T_EXDATA_REQ
	 * "advances" the urgent pointer for us.
	 *
	 * The actual urgent data carried by T_EXDATA_REQ is then prepended
	 * by a T_EXDATA_IND before being enqueued behind any existing data
	 * destined for the receiving app.  There is only a single urgent
	 * pointer (out-of-band mark) for a given tcp.  If the new urgent
	 * data arrives before the receiving app reads some existing urgent
	 * data, the previous marker is lost.  This behavior is emulated
	 * accordingly below, by removing any existing T_EXDATA_IND messages
	 * and essentially converting old urgent data into non-urgent.
	 */
	ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID);
	/* Let sender get out of urgent mode */
	tcp->tcp_valid_bits &= ~TCP_URG_VALID;

	/*
	 * This flag indicates that a signal needs to be sent up.
	 * This flag will only get cleared once SIGURG is delivered and
	 * is not affected by the tcp_fused flag -- delivery will still
	 * happen even after an endpoint is unfused, to handle the case
	 * where the sending endpoint immediately closes/unfuses after
	 * sending urgent data and the accept is not yet finished.
	 */
	peer_tcp->tcp_fused_sigurg = B_TRUE;

	/* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */
	DB_TYPE(mp) = M_PROTO;
	tei = (struct T_exdata_ind *)mp->b_rptr;
	tei->PRIM_type = T_EXDATA_IND;
	tei->MORE_flag = 0;
	mp->b_wptr = (uchar_t *)&tei[1];

	TCP_STAT(tcps, tcp_fusion_urg);
	TCPS_BUMP_MIB(tcps, tcpOutUrg);

	head = peer_tcp->tcp_rcv_list;
	while (head != NULL) {
		/*
		 * Remove existing T_EXDATA_IND, keep the data which follows
		 * it and relink our list.  Note that we don't modify the
		 * tcp_rcv_last_tail since it never points to T_EXDATA_IND.
		 */
		if (DB_TYPE(head) != M_DATA) {
			mp1 = head;

			ASSERT(DB_TYPE(mp1->b_cont) == M_DATA);
			head = mp1->b_cont;
			mp1->b_cont = NULL;
			head->b_next = mp1->b_next;
			mp1->b_next = NULL;
			if (prev_head != NULL)
				prev_head->b_next = head;
			if (peer_tcp->tcp_rcv_list == mp1)
				peer_tcp->tcp_rcv_list = head;
			if (peer_tcp->tcp_rcv_last_head == mp1)
				peer_tcp->tcp_rcv_last_head = head;
			freeb(mp1);
		}
		prev_head = head;
		head = head->b_next;
	}
}

/*
 * Fusion output routine, called by tcp_output() and tcp_wput_proto().
 * If we are modifying any member that can be changed outside the squeue,
 * like tcp_flow_stopped, we need to take tcp_non_sq_lock.
 */
boolean_t
tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size)
{
	conn_t *connp = tcp->tcp_connp;
	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
	conn_t *peer_connp = peer_tcp->tcp_connp;
	boolean_t flow_stopped, peer_data_queued = B_FALSE;
	boolean_t urgent = (DB_TYPE(mp) != M_DATA);
	boolean_t push = B_TRUE;
	mblk_t *mp1 = mp;
	uint_t ip_hdr_len;
	uint32_t recv_size = send_size;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	netstack_t *ns = tcps->tcps_netstack;
	ip_stack_t *ipst = ns->netstack_ip;
	ipsec_stack_t *ipss = ns->netstack_ipsec;
	iaflags_t ixaflags = connp->conn_ixa->ixa_flags;
	boolean_t do_ipsec, hooks_out, hooks_in, ipobs_enabled;

	ASSERT(tcp->tcp_fused);
	ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp);
	ASSERT(connp->conn_sqp == peer_connp->conn_sqp);
	ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO ||
	    DB_TYPE(mp) == M_PCPROTO);

	if (send_size == 0) {
		freemsg(mp);
		return (B_TRUE);
	}

	/*
	 * Check for minimum TTL policy differences on the connection, as
	 * these can change even after fusion.  If we detect a mismatch,
	 * unfuse and let normal stack processing handle it.
	 */
	if (peer_connp->conn_min_ttl != 0 && peer_connp->conn_min_ttl >
	    connp->conn_xmit_ipp.ipp_unicast_hops) {
		goto unfuse;
	}

	/*
	 * Handle urgent data; we either send up SIGURG to the peer now
	 * or do it later when we drain, in case the peer is detached
	 * or if we're short of memory for M_PCSIG mblk.
	 */
	if (urgent) {
		tcp_fuse_output_urg(tcp, mp);

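		/*
		 * Only the M_DATA portion is relevant for the header-
		 * building path below; the T_EXDATA_IND constructed above
		 * stays attached to mp and is enqueued with the data.
		 */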
		mp1 = mp->b_cont;
	}

	/*
	 * Check that we are still using an IRE_LOCAL or IRE_LOOPBACK before
	 * processing any further.
	 */
	if (!ip_output_verify_local(connp->conn_ixa))
		goto unfuse;

	/*
	 * Build the IP and TCP headers in case we have something that needs
	 * them.  Those cases are:
	 * 1. IPsec
	 * 2. IPobs
	 * 3. FW_HOOKS
	 *
	 * If tcp_xmit_mp() fails to dupb() the message, unfuse the
	 * connection and fall back to the regular path.
	 */
	if (ixaflags & IXAF_IS_IPV4) {
		do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) ||
		    CONN_INBOUND_POLICY_PRESENT(peer_connp, ipss);

		hooks_out = HOOKS4_INTERESTED_LOOPBACK_OUT(ipst);
		hooks_in = HOOKS4_INTERESTED_LOOPBACK_IN(ipst);
		ipobs_enabled = (ipst->ips_ip4_observe.he_interested != 0);
	} else {
		do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) ||
		    CONN_INBOUND_POLICY_PRESENT_V6(peer_connp, ipss);

		hooks_out = HOOKS6_INTERESTED_LOOPBACK_OUT(ipst);
		hooks_in = HOOKS6_INTERESTED_LOOPBACK_IN(ipst);
		ipobs_enabled = (ipst->ips_ip6_observe.he_interested != 0);
	}

	/* We do logical 'or' for efficiency */
	if (ipobs_enabled | do_ipsec | hooks_in | hooks_out) {
		if ((mp1 = tcp_xmit_mp(tcp, mp1, tcp->tcp_mss, NULL, NULL,
		    tcp->tcp_snxt, B_TRUE, NULL, B_FALSE)) == NULL)
			/* If tcp_xmit_mp fails, use regular path */
			goto unfuse;

		/*
		 * Leave all IP-relevant processing to
		 * ip_output_process_local(), which handles IPsec, IPobs,
		 * and FW_HOOKS.
		 */
		mp1 = ip_output_process_local(mp1, connp->conn_ixa, hooks_out,
		    hooks_in, do_ipsec ? peer_connp : NULL);

		/* Bail out if the message was dropped for any reason. */
		if (mp1 == NULL)
			goto unfuse;

		/*
		 * The data length might have been changed by FW_HOOKS.
		 * We assume that the first mblk contains the TCP/IP headers.
		 */
		if (hooks_in || hooks_out) {
			tcpha_t *tcpha;

			ip_hdr_len = (ixaflags & IXAF_IS_IPV4) ?
			    IPH_HDR_LENGTH((ipha_t *)mp1->b_rptr) :
			    ip_hdr_length_v6(mp1, (ip6_t *)mp1->b_rptr);

			tcpha = (tcpha_t *)&mp1->b_rptr[ip_hdr_len];
			ASSERT((uchar_t *)tcpha + sizeof (tcpha_t) <=
			    mp1->b_wptr);
			recv_size += htonl(tcpha->tha_seq) - tcp->tcp_snxt;
		}

		/*
		 * Free the message duplicated by tcp_xmit_mp(); the
		 * original message passed in remains unchanged.
		 */
		freemsg(mp1);
	}

	/*
	 * Enqueue data into the peer's receive list; we may or may not
	 * drain the contents depending on the conditions below.
	 *
	 * For non-STREAMS sockets we normally queue data directly in the
	 * socket by calling the su_recv upcall.  However, if the peer is
	 * detached we use tcp_rcv_enqueue() instead.  Queued data will be
	 * drained when the accept completes (in tcp_accept_finish()).
	 */
	if (IPCL_IS_NONSTR(peer_connp) &&
	    !TCP_IS_DETACHED(peer_tcp)) {
		int error;
		int flags = 0;

		if ((tcp->tcp_valid_bits & TCP_URG_VALID) &&
		    (tcp->tcp_urg == tcp->tcp_snxt)) {
			flags = MSG_OOB;
			(*peer_connp->conn_upcalls->su_signal_oob)
			    (peer_connp->conn_upper_handle, 0);
			tcp->tcp_valid_bits &= ~TCP_URG_VALID;
		}
		if ((*peer_connp->conn_upcalls->su_recv)(
		    peer_connp->conn_upper_handle, mp, recv_size,
		    flags, &error, &push) < 0) {
			ASSERT(error != EOPNOTSUPP);
			peer_data_queued = B_TRUE;
		}
	} else {
		if (IPCL_IS_NONSTR(peer_connp) &&
		    (tcp->tcp_valid_bits & TCP_URG_VALID) &&
		    (tcp->tcp_urg == tcp->tcp_snxt)) {
			/*
			 * We cannot deal with urgent pointers
			 * that arrive before the connection has been
			 * accept()ed.
			 */
			tcp->tcp_valid_bits &= ~TCP_URG_VALID;
			freemsg(mp);
			return (B_TRUE);
		}

		tcp_rcv_enqueue(peer_tcp, mp, recv_size,
		    tcp->tcp_connp->conn_cred);

		/* In case it wrapped around and also to keep it constant */
		peer_tcp->tcp_rwnd += recv_size;
	}

	/*
	 * Exercise flow-control when needed; we will get back-enabled
	 * in either tcp_accept_finish(), tcp_unfuse(), or when data is
	 * consumed.  If peer endpoint is detached, we emulate streams flow
	 * control by checking the peer's queue size and high water mark;
	 * otherwise we simply use canputnext() to decide if we need to stop
	 * our flow.
	 *
	 * Since we are accessing our tcp_flow_stopped and might modify it,
	 * we need to take tcp->tcp_non_sq_lock.
	 */
	mutex_enter(&tcp->tcp_non_sq_lock);
	flow_stopped = tcp->tcp_flow_stopped;
	if ((TCP_IS_DETACHED(peer_tcp) &&
	    (peer_tcp->tcp_rcv_cnt >= peer_connp->conn_rcvbuf)) ||
	    (!TCP_IS_DETACHED(peer_tcp) &&
	    !IPCL_IS_NONSTR(peer_connp) && !canputnext(peer_connp->conn_rq))) {
		peer_data_queued = B_TRUE;
	}

	if (!flow_stopped && (peer_data_queued ||
	    (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf))) {
		tcp_setqfull(tcp);
		flow_stopped = B_TRUE;
		TCP_STAT(tcps, tcp_fusion_flowctl);
		DTRACE_PROBE3(tcp__fuse__output__flowctl, tcp_t *, tcp,
		    uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt);
	} else if (flow_stopped && !peer_data_queued &&
	    (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat)) {
		tcp_clrqfull(tcp);
		TCP_STAT(tcps, tcp_fusion_backenabled);
		flow_stopped = B_FALSE;
	}
	mutex_exit(&tcp->tcp_non_sq_lock);

	ipst->ips_loopback_packets++;
	tcp->tcp_last_sent_len = send_size;

	/* Need to adjust the following SNMP MIB-related variables */
	tcp->tcp_snxt += send_size;
	tcp->tcp_suna = tcp->tcp_snxt;
	peer_tcp->tcp_rnxt += recv_size;
	peer_tcp->tcp_last_recv_len = recv_size;
	peer_tcp->tcp_rack = peer_tcp->tcp_rnxt;

	TCPS_BUMP_MIB(tcps, tcpOutDataSegs);
	TCPS_BUMP_MIB(tcps, tcpHCOutSegs);
	TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, send_size);
	tcp->tcp_cs.tcp_out_data_bytes += send_size;
	tcp->tcp_cs.tcp_out_data_segs++;

	TCPS_BUMP_MIB(tcps, tcpHCInSegs);
	TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs);
	TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, send_size);
	peer_tcp->tcp_cs.tcp_in_data_inorder_bytes += send_size;
	peer_tcp->tcp_cs.tcp_in_data_inorder_segs++;

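	/*
	 * Fire the tcp send/receive DTrace probes for observability; there
	 * are no real IP/TCP headers on the fused path, hence the NULL
	 * header arguments.
	 */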
	DTRACE_TCP5(send, void, NULL, ip_xmit_attr_t *, connp->conn_ixa,
	    __dtrace_tcp_void_ip_t *, NULL, tcp_t *, tcp,
	    __dtrace_tcp_tcph_t *, NULL);
	DTRACE_TCP5(receive, void, NULL, ip_xmit_attr_t *,
	    peer_connp->conn_ixa, __dtrace_tcp_void_ip_t *, NULL,
	    tcp_t *, peer_tcp, __dtrace_tcp_tcph_t *, NULL);

	if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
	    !TCP_IS_DETACHED(peer_tcp)) {
		/*
		 * Drain the peer's receive queue if it has urgent data or if
		 * we're not flow-controlled.
		 */
		if (urgent || !flow_stopped) {
			ASSERT(peer_tcp->tcp_rcv_list != NULL);
			/*
			 * For TLI-based streams, a thread in tcp_accept_swap()
			 * can race with us.  That thread will ensure that the
			 * correct peer_connp->conn_rq is globally visible
			 * before peer_tcp->tcp_detached is visible as clear,
			 * but we must also ensure that the load of conn_rq
			 * cannot be reordered to be before the tcp_detached
			 * check.
			 */
			membar_consumer();
			(void) tcp_fuse_rcv_drain(peer_connp->conn_rq, peer_tcp,
			    NULL);
		}
	}
	return (B_TRUE);
unfuse:
	tcp_unfuse(tcp);
	return (B_FALSE);
}

/*
 * This routine gets called to deliver data upstream on a fused or
 * previously fused tcp loopback endpoint; the latter happens only
 * when there is a pending SIGURG signal plus urgent data that couldn't
 * be sent upstream earlier.
 */
boolean_t
tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
{
	mblk_t *mp;
	conn_t *connp = tcp->tcp_connp;

#ifdef DEBUG
	uint_t cnt = 0;
#endif
	tcp_stack_t *tcps = tcp->tcp_tcps;
	tcp_t *peer_tcp = tcp->tcp_loopback_peer;

	ASSERT(tcp->tcp_loopback);
	ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
	ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
	ASSERT(IPCL_IS_NONSTR(connp) || sigurg_mpp != NULL || tcp->tcp_fused);

	/* No need for the push timer now, in case it was scheduled */
	if (tcp->tcp_push_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
		tcp->tcp_push_tid = 0;
	}
	/*
	 * If there's urgent data sitting in the receive list and we didn't
	 * get a chance to send up a SIGURG signal, make sure we send
	 * it first before draining in order to ensure that SIOCATMARK
	 * works properly.
	 */
	if (tcp->tcp_fused_sigurg) {
		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));

		tcp->tcp_fused_sigurg = B_FALSE;
		/*
		 * sigurg_mpp is normally NULL, i.e. when we're still
		 * fused and didn't get here because of tcp_unfuse().
		 * In this case try hard to allocate the M_PCSIG mblk.
		 */
		if (sigurg_mpp == NULL &&
		    (mp = allocb(1, BPRI_HI)) == NULL &&
		    (mp = allocb_tryhard(1)) == NULL) {
			/* Alloc failed; try again next time */
			tcp->tcp_push_tid = TCP_TIMER(tcp,
			    tcp_push_timer, tcps->tcps_push_timer_interval);
			return (B_TRUE);
		} else if (sigurg_mpp != NULL) {
			/*
			 * Use the supplied M_PCSIG mblk; it means we're
			 * either unfused or in the process of unfusing,
			 * and the drain must happen now.
			 */
			mp = *sigurg_mpp;
			*sigurg_mpp = NULL;
		}
		ASSERT(mp != NULL);

		/* Send up the signal */
		DB_TYPE(mp) = M_PCSIG;
		*mp->b_wptr++ = (uchar_t)SIGURG;
		putnext(q, mp);

		/*
		 * Let the regular tcp_rcv_drain() path handle
		 * draining the data if we're no longer fused.
		 */
		if (!tcp->tcp_fused)
			return (B_FALSE);
	}

	/* Drain the data */
	while ((mp = tcp->tcp_rcv_list) != NULL) {
		tcp->tcp_rcv_list = mp->b_next;
		mp->b_next = NULL;
#ifdef DEBUG
		cnt += msgdsize(mp);
#endif
		ASSERT(!IPCL_IS_NONSTR(connp));
		putnext(q, mp);
		TCP_STAT(tcps, tcp_fusion_putnext);
	}

#ifdef DEBUG
	ASSERT(cnt == tcp->tcp_rcv_cnt);
#endif
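	/*
	 * The receive list is now empty; reset its bookkeeping and reopen
	 * the receive window to the full receive buffer size.
	 */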
	tcp->tcp_rcv_last_head = NULL;
	tcp->tcp_rcv_last_tail = NULL;
	tcp->tcp_rcv_cnt = 0;
	tcp->tcp_rwnd = tcp->tcp_connp->conn_rcvbuf;

	mutex_enter(&peer_tcp->tcp_non_sq_lock);
	if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <=
	    peer_tcp->tcp_connp->conn_sndlowat)) {
		tcp_clrqfull(peer_tcp);
		TCP_STAT(tcps, tcp_fusion_backenabled);
	}
	mutex_exit(&peer_tcp->tcp_non_sq_lock);

	return (B_TRUE);
}

/*
 * Calculate the size of the receive buffer for a fused tcp endpoint.
 */
size_t
tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd)
{
	tcp_stack_t *tcps = tcp->tcp_tcps;
	uint32_t max_win;

	ASSERT(tcp->tcp_fused);

	/* Ensure that value is within the maximum upper bound */
	if (rwnd > tcps->tcps_max_buf)
		rwnd = tcps->tcps_max_buf;
	/*
	 * Round up to system page size in case SO_RCVBUF is modified
	 * after SO_SNDBUF; the latter is also similarly rounded up.
	 */
	rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t);
	max_win = TCP_MAXWIN << tcp->tcp_rcv_ws;
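	/*
	 * Clamp to the largest window the current window scale can
	 * advertise, rounded down to an MSS multiple; fall back to the
	 * full maximum if that rounding would leave less than one MSS.
	 */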
	if (rwnd > max_win) {
		rwnd = max_win - (max_win % tcp->tcp_mss);
		if (rwnd < tcp->tcp_mss)
			rwnd = max_win;
	}

	/*
	 * Record the high water mark; this is used for flow-control
	 * purposes in tcp_fuse_output().
	 */
	tcp->tcp_connp->conn_rcvbuf = rwnd;
	tcp->tcp_rwnd = rwnd;
	return (rwnd);
}

/*
 * Calculate the maximum outstanding unread data block for a fused
 * tcp endpoint.
 */
int
tcp_fuse_maxpsz(tcp_t *tcp)
{
	tcp_t *peer_tcp = tcp->tcp_loopback_peer;
	conn_t *connp = tcp->tcp_connp;
	uint_t sndbuf = connp->conn_sndbuf;
	uint_t maxpsz = sndbuf;

	ASSERT(tcp->tcp_fused);
	ASSERT(peer_tcp != NULL);
	ASSERT(peer_tcp->tcp_connp->conn_rcvbuf != 0);
	/*
	 * In the fused loopback case, we want the stream head to split
	 * up larger writes into smaller chunks for a more accurate flow-
	 * control accounting.  Our maxpsz is half of the sender's send
	 * buffer or the receiver's receive buffer, whichever is smaller.
	 * We round up the buffer to system page size due to the lack of
	 * a TCP MSS concept in Fusion.
	 */
	if (maxpsz > peer_tcp->tcp_connp->conn_rcvbuf)
		maxpsz = peer_tcp->tcp_connp->conn_rcvbuf;
	maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1;

	return (maxpsz);
}

/*
 * Called to release flow control.
 */
void
tcp_fuse_backenable(tcp_t *tcp)
{
	tcp_t *peer_tcp = tcp->tcp_loopback_peer;

	ASSERT(tcp->tcp_fused);
	ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused);
	ASSERT(peer_tcp->tcp_loopback_peer == tcp);
	ASSERT(!TCP_IS_DETACHED(tcp));
	ASSERT(tcp->tcp_connp->conn_sqp ==
	    peer_tcp->tcp_connp->conn_sqp);

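	/*
	 * First push any data still sitting on our receive list up to the
	 * stream head, then lift the peer's flow-control condition if its
	 * unsent byte count has dropped below the low-water mark.
	 */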
	if (tcp->tcp_rcv_list != NULL)
		(void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, NULL);

	mutex_enter(&peer_tcp->tcp_non_sq_lock);
	if (peer_tcp->tcp_flow_stopped &&
	    (TCP_UNSENT_BYTES(peer_tcp) <=
	    peer_tcp->tcp_connp->conn_sndlowat)) {
		tcp_clrqfull(peer_tcp);
	}
	mutex_exit(&peer_tcp->tcp_non_sq_lock);

	TCP_STAT(tcp->tcp_tcps, tcp_fusion_backenabled);
}