/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_rss.h"
#include "opt_tcpdebug.h"

/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp. Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire. It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
 * First, and probably the main way it is used by Rack and BBR, it can
 * be used to call tcp_output() of a transport stack at some time in the
 * future. The normal way this is done is that tcp_output() of the stack
 * schedules itself to be called again by calling
 * tcp_hpts_insert(tcpcb, slot). The slot is the time from now that the
 * stack wants to be called, but it must be converted to tcp_hpts's notion
 * of a slot. This is done with one of the macros HPTS_MS_TO_SLOTS or
 * HPTS_USEC_TO_SLOTS. So a typical call from the tcp_output() routine
 * might look like:
 *
 *	tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add, near
 * its top, a check to prevent unwanted calls (from user land or the
 * arrival of incoming ACKs). So it would add something like:
 *
 *	if (inp->inp_in_hpts)
 *		return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare bones example and the stack will probably
 * have more considerations than just the above.
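 *
 * As a slightly fuller sketch of the pattern above (hypothetical:
 * my_stack_output() and the way the pacing gap is computed are
 * illustrative, not part of this API), a stack's output routine
 * might pace itself like so:
 *
 *	static int
 *	my_stack_output(struct tcpcb *tp)
 *	{
 *		uint32_t pace_usecs;
 *
 *		if (tp->t_inpcb->inp_in_hpts)
 *			return (0);
 *		(... send what is currently allowed onto the wire ...)
 *		pace_usecs = 550;
 *		tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(pace_usecs));
 *		return (0);
 *	}
 *
 * The early return keeps userland and inbound ACKs from re-running
 * output while a pacing timer is pending; the insert re-arms the
 * pacer so output runs again pace_usecs from now.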
 *
 * Now the second function (actually two functions, I guess :D)
 * the tcp_hpts system provides is the ability to either abort
 * a connection (later) or process input on a connection.
 * Why would you want to do this? To keep processor locality
 * and/or to avoid having to worry about untangling any recursive
 * locks. The input function is now hooked into the new LRO
 * system as well.
 *
 * In order to use the input redirection function the
 * tcp stack must define an input function for
 * tfb_do_queued_segments(). This function understands
 * how to dequeue an array of packets that were input and
 * knows how to call the correct processing routine.
 *
 * Locking in this is important as well, so most likely the
 * stack will need to define tfb_do_segment_nounlock(),
 * splitting tfb_do_segment() into two parts. The main processing
 * part does not unlock the INP and returns a value of 1 or 0.
 * It returns 0 if all is well and the lock was not released. It
 * returns 1 if we had to destroy the TCB (a reset received, etc.).
 * The remains of tfb_do_segment() then become just a simple call
 * to tfb_do_segment_nounlock(), checking the return
 * code and possibly unlocking.
 *
 * The stack must also set the flag on the INP that it supports this
 * feature, i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
 * this flag as well and will queue packets when it is set.
 * There are other flags as well, INP_MBUF_QUEUE_READY and
 * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
 * that we are in the pacer for output so there is no
 * need to wake up the hpts system to get immediate
 * input. The second tells the LRO code that it is okay
 * if a SACK arrives; input can still be deferred and
 * the current hpts timer left running (this is usually set when
 * a rack timer is up so we know SACKs are happening
 * on the connection already and don't want to wake up yet).
 *
 * There is a common function within the rack_bbr_common code,
 * ctf_do_queued_segments(). This function
 * knows how to take the input queue of packets from
 * tp->t_in_pkts and process them, digging out
 * all the arguments, calling any bpf tap and
 * calling into tfb_do_segment_nounlock(). The common
 * function (ctf_do_queued_segments()) requires that
 * you have defined the tfb_do_segment_nounlock() as
 * described above.
 *
 * The second feature of the input side of hpts is the
 * dropping of a connection. This is due to the way that
 * locking may have occurred on the INP_WLOCK. So if
 * a stack wants to drop a connection it calls:
 *
 *	tcp_set_inp_to_drop(tp, ETIMEDOUT)
 *
 * to schedule the tcp_hpts system to call
 *
 *	tcp_drop(tp, drop_reason)
 *
 * at a future point. This is quite handy to prevent locking
 * issues when dropping connections.
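 *
 * As an illustrative sketch of the tfb_do_segment() split described
 * above (hypothetical: the my_stack_*() names and elided argument
 * lists are made up; only the tfb_do_segment()/
 * tfb_do_segment_nounlock() shape is real):
 *
 *	static int
 *	my_stack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th,
 *	    struct socket *so, struct tcpcb *tp, ...)
 *	{
 *		(... process the segment; the INP stays write-locked ...)
 *		return (0);		(1 only if the TCB was destroyed)
 *	}
 *
 *	static void
 *	my_stack_do_segment(struct mbuf *m, struct tcphdr *th,
 *	    struct socket *so, struct tcpcb *tp, ...)
 *	{
 *		if (my_stack_do_segment_nounlock(m, th, so, tp, ...) == 0)
 *			INP_WUNLOCK(tp->t_inpcb);
 *	}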
 *
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>
#include <vm/vm.h>

#include <net/route.h>
#include <net/vnet.h>

#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
#endif

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
#include <netinet/tcp_log_buf.h>

#ifdef tcpdebug
#include <netinet/tcp_debug.h>
#endif				/* tcpdebug */
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif

MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);

static struct tcp_hptsi tcp_pace;
static int hpts_does_tp_logging = 0;

static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
static void tcp_hptsi(struct tcp_hpts_entry *hpts);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int32_t tcp_hpts_callout_skip_swi = 0;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP Hpts controls");

#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)

static int32_t tcp_hpts_precision = 120;

struct hpts_domain_info {
	int count;
	int cpu[MAXCPU];
};

struct hpts_domain_info hpts_domains[MAXMEMDOM];

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for PRE() precision of callout");

counter_u64_t hpts_hopelessly_behind;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
    &hpts_hopelessly_behind,
    "Number of times hpts could not catch up and was behind hopelessly");

counter_u64_t hpts_loops;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
    &back_tosleep, "Number of times hpts found no tcbs");

counter_u64_t combined_wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
    &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

counter_u64_t wheel_wrap;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
    &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");

static int32_t out_ts_percision = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
    &out_ts_percision, 0,
    "Do we use a precise timestamp for every output cts");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &hpts_does_tp_logging, 0,
    "Do we add to any tp that has logging on pacer logs");

static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
    &max_pacer_loops, 10,
    "What is the maximum number of times the pacer will loop trying to catch up");

#define	HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)

static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;

static int
sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
{
	int error;
	uint32_t new;

	new = hpts_sleep_max;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error == 0 && req->newptr) {
		if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
		    (new > HPTS_MAX_SLEEP_ALLOWED))
			error = EINVAL;
		else
			hpts_sleep_max = new;
	}
	return (error);
}

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &hpts_sleep_max, 0,
    &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
    "Maximum time hpts will sleep");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
    &tcp_min_hptsi_time, 0,
    "The minimum time the hpts must sleep before processing more slots");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
    &tcp_hpts_callout_skip_swi, 0,
    "Do we have the callout call directly to the hpts?");

static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
    int ticks_to_run, int idx)
{
	union tcp_log_stackspecific log;

	memset(&log.u_bbr, 0, sizeof(log.u_bbr));
	log.u_bbr.flex1 = hpts->p_nxt_slot;
	log.u_bbr.flex2 = hpts->p_cur_slot;
	log.u_bbr.flex3 = hpts->p_prev_slot;
	log.u_bbr.flex4 = idx;
	log.u_bbr.flex5 = hpts->p_curtick;
	log.u_bbr.flex6 = hpts->p_on_queue_cnt;
	log.u_bbr.use_lt_bw = 1;
	log.u_bbr.inflight = ticks_to_run;
	log.u_bbr.applimited = hpts->overidden_sleep;
	log.u_bbr.delivered = hpts->saved_curtick;
	log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
	log.u_bbr.epoch = hpts->saved_curslot;
	log.u_bbr.lt_epoch = hpts->saved_prev_slot;
	log.u_bbr.pkts_out = hpts->p_delayed_by;
	log.u_bbr.lost = hpts->p_hpts_sleep_time;
	log.u_bbr.cur_del_rate = hpts->p_runningtick;
	TCP_LOG_EVENTP(tp, NULL,
	    &tp->t_inpcb->inp_socket->so_rcv,
	    &tp->t_inpcb->inp_socket->so_snd,
	    BBR_LOG_HPTSDIAG, 0,
	    0, &log, false, tv);
}

static void
hpts_timeout_swi(void *arg)
{
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
}

static void
hpts_timeout_dir(void *arg)
{
	tcp_hpts_thread(arg);
}

static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_hpts == 0) {
		/* We are not on the hpts? */
		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_REMOVE(head, inp, inp_hpts);
	hpts->p_on_queue_cnt--;
	if (hpts->p_on_queue_cnt < 0) {
		/* Count should not go negative .. */
#ifdef INVARIANTS
		panic("Hpts goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_queue_cnt = 0;
	}
	if (clear) {
		inp->inp_hpts_request = 0;
		inp->inp_in_hpts = 0;
	}
}

static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if ((noref == 0) && (inp->inp_in_hpts == 1)) {
		/* We are already on the hpts? */
		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
	inp->inp_in_hpts = 1;
	hpts->p_on_queue_cnt++;
	if (noref == 0) {
		in_pcbref(inp);
	}
}

static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 0) {
		/* We are not on the input hpts? */
		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
	hpts->p_on_inqueue_cnt--;
	if (hpts->p_on_inqueue_cnt < 0) {
#ifdef INVARIANTS
		panic("Hpts in goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_inqueue_cnt = 0;
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		/* We should not be empty with a queue count */
		panic("%s hpts:%p in_hpts input empty but cnt:%d",
		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	if (clear)
		inp->inp_in_input = 0;
}

static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 1) {
		/* We are already on the input hpts? */
		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
	inp->inp_in_input = 1;
	hpts->p_on_inqueue_cnt++;
	in_pcbref(inp);
}

static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_hpts_wake_scheduled == 0) {
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
}

static void
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_hpts_wake_scheduled == 0) {
		hpts->p_hpts_wake_scheduled = 1;
		swi_sched(hpts->ie_cookie, 0);
	}
}

struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
	int32_t hpts_num;
	struct tcp_hpts_entry *hpts;

	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	return (hpts);
}

struct tcp_hpts_entry *
tcp_hpts_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_hpts_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

struct tcp_hpts_entry *
tcp_input_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_input_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_input_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

static void
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
	int32_t add_freed;

	if (inp->inp_flags2 & INP_FREED) {
		/*
		 * Need to play a special trick so that in_pcbrele_wlocked
		 * does not return 1 when it really should have returned 0.
		 */
		add_freed = 1;
		inp->inp_flags2 &= ~INP_FREED;
	} else {
		add_freed = 0;
	}
#ifndef INP_REF_DEBUG
	if (in_pcbrele_wlocked(inp)) {
		/*
		 * This should not happen. We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts. It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#else
	if (__in_pcbrele_wlocked(inp, line)) {
		/*
		 * This should not happen. We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts. It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#endif
	if (add_freed) {
		inp->inp_flags2 |= INP_FREED;
	}
}

static void
tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	if (inp->inp_in_hpts) {
		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

static void
tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input) {
		hpts_sane_input_remove(hpts, inp, 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

/*
 * Called normally with the INP_LOCKED, but it
 * does not matter; the hpts lock is the key,
 * and the lock order allows us to hold the
 * INP lock and then get the hpts lock.
 *
 * Valid values in the flags are
 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
 * HPTS_REMOVE_INPUT - remove from the input of the hpts.
 * Note that you can use one or both values together
 * and get two actions.
 */
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	if (flags & HPTS_REMOVE_OUTPUT) {
		hpts = tcp_hpts_lock(inp);
		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
	if (flags & HPTS_REMOVE_INPUT) {
		hpts = tcp_input_lock(inp);
		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
}

static inline int
hpts_tick(uint32_t wheel_tick, uint32_t plus)
{
	/*
	 * Given a slot on the wheel, what slot
	 * is that plus ticks out?
	 */
	KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
	return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
}

static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
	/*
	 * Given a timestamp in wheel ticks (10 usec increments),
	 * map it onto our limited space wheel.
	 */
	return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}

static inline int
hpts_ticks_diff(int prev_tick, int tick_now)
{
	/*
	 * Given two ticks that are somewhere
	 * on our wheel, how far are they apart?
	 */
	if (tick_now > prev_tick)
		return (tick_now - prev_tick);
	else if (tick_now == prev_tick)
		/*
		 * Special case, same means we can go all of our
		 * wheel less one slot.
		 */
		return (NUM_OF_HPTSI_SLOTS - 1);
	else
		return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
}

/*
 * Given a tick on the wheel that is the current time
 * mapped to the wheel (wheel_tick), what is the maximum
 * distance forward that can be obtained without
 * wrapping past either prev_tick or running_tick,
 * depending on the hpts state? Also, if passed
 * a uint32_t *, fill it with the tick location.
 *
 * Note that if you do not give this function the current
 * time (that you think it is) mapped to the wheel,
 * then the results will not be what you expect and
 * could lead to invalid inserts.
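 *
 * A small worked example of the wheel arithmetic involved
 * (illustrative numbers): each wheel tick covers 10 usec. If
 * prev_tick is 5 and the current time maps to wheel_tick 9, then
 * hpts_ticks_diff(5, 9) == 4 slots have gone by, leaving
 * NUM_OF_HPTSI_SLOTS - 4 slots that can still be inserted into.
 * When the two ticks are equal, the diff is defined as the whole
 * wheel less one slot, since a full wrap is indistinguishable
 * from no movement at all.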
 */
static inline int32_t
max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
{
	uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;

	if ((hpts->p_hpts_active == 1) &&
	    (hpts->p_wheel_complete == 0)) {
		end_tick = hpts->p_runningtick;
		/* Back up one tick */
		if (end_tick == 0)
			end_tick = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_tick--;
		if (target_tick)
			*target_tick = end_tick;
	} else {
		/*
		 * For the case where we are
		 * not active, or we have
		 * completed the pass over
		 * the wheel, we can use the
		 * prev tick and subtract one from it. This puts us
		 * as far out as possible on the wheel.
		 */
		end_tick = hpts->p_prev_slot;
		if (end_tick == 0)
			end_tick = NUM_OF_HPTSI_SLOTS - 1;
		else
			end_tick--;
		if (target_tick)
			*target_tick = end_tick;
		/*
		 * Now we have close to the full wheel left minus the
		 * time it has been since the pacer went to sleep. Note
		 * that wheel_tick, passed in, should be the current time
		 * from the perspective of the caller, mapped to the wheel.
		 */
		if (hpts->p_prev_slot != wheel_tick)
			dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
		else
			dis_to_travel = 1;
		/*
		 * dis_to_travel in this case is the space from when the
		 * pacer stopped (p_prev_slot) to where our wheel_tick
		 * is now. To know how many slots we can put it in we
		 * subtract from the wheel size. We would not want
		 * to place something after p_prev_slot or it will
		 * get run too soon.
		 */
		return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
	}
	/*
	 * How many slots are open between p_runningtick -> p_cur_slot?
	 * That is what is currently unavailable for insertion. Special
	 * case when we are at the last slot, this gets 1, so that
	 * the answer to how many slots are available is all but 1.
	 */
	if (hpts->p_runningtick == hpts->p_cur_slot)
		dis_to_travel = 1;
	else
		dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
	/*
	 * How long has the pacer been running?
	 */
	if (hpts->p_cur_slot != wheel_tick) {
		/* The pacer is a bit late */
		pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
	} else {
		/* The pacer is right on time, now == pacer's start time */
		pacer_to_now = 0;
	}
	/*
	 * To get the number left we can insert into, we simply
	 * subtract the distance the pacer has to run from how
	 * many slots there are.
	 */
	avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
	/*
	 * Now how many of those we will eat due to the pacer's
	 * time (p_cur_slot) of start being behind the
	 * real time (wheel_tick)?
	 */
	if (avail_on_wheel <= pacer_to_now) {
		/*
		 * Wheel wrap; we can't fit on the wheel. That
		 * is unusual, the system must be way overloaded!
		 * Insert into the assured tick, and return the
		 * special value 0.
		 */
		counter_u64_add(combined_wheel_wrap, 1);
		*target_tick = hpts->p_nxt_slot;
		return (0);
	} else {
		/*
		 * We know how many slots are open
		 * on the wheel (the reverse of what
		 * is left to run). Take away the time
		 * the pacer started to now (wheel_tick)
		 * and that tells you how many slots are
		 * open that can be inserted into that won't
		 * be touched by the pacer until later.
		 */
		return (avail_on_wheel - pacer_to_now);
	}
}

static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
	uint32_t need_wake = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_hpts == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		inp->inp_hpts_request = 0;
		if ((hpts->p_hpts_active == 0) ||
		    (hpts->p_wheel_complete)) {
			/*
			 * A sleeping hpts; we want it to run in the next
			 * slot. Note that in this state
			 * p_prev_slot == p_cur_slot.
			 */
			inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
			if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
				need_wake = 1;
		} else if ((void *)inp == hpts->p_inp) {
			/*
			 * The hpts system is running and the caller
			 * was awoken by the hpts system.
			 * We can't allow you to go into the same slot we
			 * are in (we don't want a loop :-D).
			 */
			inp->inp_hptsslot = hpts->p_nxt_slot;
		} else
			inp->inp_hptsslot = hpts->p_runningtick;
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		if (need_wake) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
	}
	return (need_wake);
}

int
__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
{
	int32_t ret;
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

#ifdef INVARIANTS
static void
check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
	/*
	 * Sanity checks for the pacer with invariants
	 * on insert.
	 */
	if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
		panic("hpts:%p inp:%p slot:%d > max",
		    hpts, inp, inp_hptsslot);
	if ((hpts->p_hpts_active) &&
	    (hpts->p_wheel_complete == 0)) {
		/*
		 * If the pacer is processing an arc
		 * of the wheel, we need to make
		 * sure we are not inserting within
		 * that arc.
		 */
		int distance, yet_to_run;

		distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
		if (hpts->p_runningtick != hpts->p_cur_slot)
			yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
		else
			yet_to_run = 0;	/* processing last slot */
		if (yet_to_run > distance) {
			panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
			    hpts, inp, inp_hptsslot,
			    distance, yet_to_run,
			    hpts->p_runningtick, hpts->p_cur_slot);
		}
	}
}
#endif

static void
tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
    struct hpts_diag *diag, struct timeval *tv)
{
	uint32_t need_new_to = 0;
	uint32_t wheel_cts, last_tick;
	int32_t wheel_tick, maxticks;
	int8_t need_wakeup = 0;

	HPTS_MTX_ASSERT(hpts);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
		diag->p_prev_slot = hpts->p_prev_slot;
		diag->p_runningtick = hpts->p_runningtick;
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
		diag->p_curtick = hpts->p_curtick;
		diag->p_lasttick = hpts->p_lasttick;
		diag->slot_req = slot;
		diag->p_on_min_sleep = hpts->p_on_min_sleep;
		diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
	}
	if (inp->inp_in_hpts == 0) {
		if (slot == 0) {
			/* Immediate */
			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
			return;
		}
		/* Get the current time relative to the wheel */
		wheel_cts = tcp_tv_to_hptstick(tv);
		/* Map it onto the wheel */
		wheel_tick = tick_to_wheel(wheel_cts);
		/* Now what's the max we can place it at? */
		maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
		if (diag) {
			diag->wheel_tick = wheel_tick;
			diag->maxticks = maxticks;
			diag->wheel_cts = wheel_cts;
		}
		if (maxticks == 0) {
			/* The pacer is in a wheel wrap behind, yikes! */
			if (slot > 1) {
				/*
				 * Reduce by 1 to prevent a forever loop in
				 * case something else is wrong. Note this
				 * probably does not hurt because, if it is
				 * true, the pacer is so far behind we will
				 * be > 1 second late calling anyway.
				 */
				slot--;
			}
			inp->inp_hptsslot = last_tick;
			inp->inp_hpts_request = slot;
		} else if (maxticks >= slot) {
			/* It all fits on the wheel */
			inp->inp_hpts_request = 0;
			inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
		} else {
			/* It does not fit */
			inp->inp_hpts_request = slot - maxticks;
			inp->inp_hptsslot = last_tick;
		}
		if (diag) {
			diag->slot_remaining = inp->inp_hpts_request;
			diag->inp_hptsslot = inp->inp_hptsslot;
		}
#ifdef INVARIANTS
		check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
		if ((hpts->p_hpts_active == 0) &&
		    (inp->inp_hpts_request == 0) &&
		    (hpts->p_on_min_sleep == 0)) {
			/*
			 * The hpts is sleeping and not on a minimum
			 * sleep time; we need to figure out where
			 * it will wake up and whether we need to
			 * reschedule its time-out.
			 */
			uint32_t have_slept, yet_to_sleep;

			/*
			 * Now do we need to restart the hpts's timer?
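			 *
			 * For example (illustrative numbers): if the hpts
			 * went to sleep for p_hpts_sleep_time = 100 ticks,
			 * have_slept works out below to 40, and the new
			 * entry is slot = 10 ticks away, then yet_to_sleep
			 * = 60 > 10 and we shorten the sleep to 10 ticks,
			 * rescheduling the callout. Were slot >= 60, the
			 * existing timeout would already fire soon enough.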
			 */
			have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
			if (have_slept < hpts->p_hpts_sleep_time)
				yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
			else {
				/* We are over-due */
				yet_to_sleep = 0;
				need_wakeup = 1;
			}
			if (diag) {
				diag->have_slept = have_slept;
				diag->yet_to_sleep = yet_to_sleep;
			}
			if (yet_to_sleep &&
			    (yet_to_sleep > slot)) {
				/*
				 * We need to reschedule the hpts's time-out.
				 */
				hpts->p_hpts_sleep_time = slot;
				need_new_to = slot * HPTS_TICKS_PER_USEC;
			}
		}
		/*
		 * Now, how far out is the hpts sleeping? If active is 1,
		 * it is up and ticking and we do nothing; otherwise we
		 * may need to reschedule its callout if need_new_to is
		 * set from above.
		 */
		if (need_wakeup) {
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
			if (diag) {
				diag->need_new_to = 0;
				diag->co_ret = 0xffff0000;
			}
		} else if (need_new_to) {
			int32_t co_ret;
			struct timeval tv;
			sbintime_t sb;

			tv.tv_sec = 0;
			tv.tv_usec = 0;
			while (need_new_to > HPTS_USEC_IN_SEC) {
				tv.tv_sec++;
				need_new_to -= HPTS_USEC_IN_SEC;
			}
			tv.tv_usec = need_new_to;
			sb = tvtosbt(tv);
			if (tcp_hpts_callout_skip_swi == 0) {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_swi, hpts, hpts->p_cpu,
				    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
			} else {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_dir, hpts,
				    hpts->p_cpu,
				    C_PREL(tcp_hpts_precision));
			}
			if (diag) {
				diag->need_new_to = need_new_to;
				diag->co_ret = co_ret;
			}
		}
	} else {
#ifdef INVARIANTS
		panic("Hpts:%p inp:%p already on hpts and add?", hpts, inp);
#endif
	}
}

uint32_t
tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
{
	struct tcp_hpts_entry *hpts;
	uint32_t slot_on;
	struct timeval tv;

	/*
	 * We now return the next-slot the hpts will be on, beyond its
	 * current run (if up) or where it was when it stopped if it is
	 * sleeping.
	 */
	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	microuptime(&tv);
	tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv);
	slot_on = hpts->p_nxt_slot;
	mtx_unlock(&hpts->p_mtx);
	return (slot_on);
}

uint32_t
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line)
{
	return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}

int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
	int32_t retval = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		retval = 1;
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			retval = 2;
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		retval = 4;
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	return (retval);
}

int32_t
__tcp_queue_to_input(struct inpcb *inp, int line)
{
	struct tcp_hpts_entry *hpts;
	int32_t ret;

	hpts = tcp_input_lock(inp);
	ret = __tcp_queue_to_input_locked(inp, hpts, line);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

void
__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
{
	struct tcp_hpts_entry *hpts;
	struct tcpcb *tp;

	tp = intotcpcb(inp);
	hpts = tcp_input_lock(tp->t_inpcb);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	inp->inp_hpts_drop_reas = reason;
	mtx_unlock(&hpts->p_mtx);
}

static uint16_t
hpts_random_cpu(struct inpcb *inp)
{
	/*
	 * No flow type set; distribute the load randomly.
	 */
	uint16_t cpuid;
	uint32_t ran;

	/*
	 * If one has been set, use it, i.e. we want both in and out on
	 * the same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* Nothing set, use a random number */
	ran = arc4random();
	cpuid = (ran & 0xffff) % mp_ncpus;
	return (cpuid);
}

static uint16_t
hpts_cpuid(struct inpcb *inp)
{
	u_int cpuid;
#if !defined(RSS) && defined(NUMA)
	struct hpts_domain_info *di;
#endif

	/*
	 * If one has been set, use it, i.e. we want both in and out on
	 * the same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* If one is set the other must be the same */
#ifdef RSS
	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
	if (cpuid == NETISR_CPUID_NONE)
		return (hpts_random_cpu(inp));
	else
		return (cpuid);
#else
	/*
	 * We don't have a flowid -> cpuid mapping, so cheat and just map
	 * unknown cpuids to curcpu. Not the best, but apparently better
	 * than defaulting to swi 0.
	 */

	if (inp->inp_flowtype == M_HASHTYPE_NONE)
		return (hpts_random_cpu(inp));
	/*
	 * Hash to a thread based on the flowid. If we are using numa,
	 * then restrict the hash to the numa domain where the inp lives.
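	 *
	 * For example, with tcp_bind_threads == 2, an inp in NUMA domain
	 * d with flowid f is served by the hpts thread on CPU
	 * hpts_domains[d].cpu[f % hpts_domains[d].count]; without the
	 * NUMA restriction it is simply CPU f % mp_ncpus.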
	 */
#ifdef NUMA
	if (tcp_bind_threads == 2 && inp->inp_numa_domain != M_NODOM) {
		di = &hpts_domains[inp->inp_numa_domain];
		cpuid = di->cpu[inp->inp_flowid % di->count];
	} else
#endif
		cpuid = inp->inp_flowid % mp_ncpus;

	return (cpuid);
#endif
}

static void
tcp_drop_in_pkts(struct tcpcb *tp)
{
	struct mbuf *m, *n;

	m = tp->t_in_pkt;
	if (m)
		n = m->m_nextpkt;
	else
		n = NULL;
	tp->t_in_pkt = NULL;
	while (m) {
		m_freem(m);
		m = n;
		if (m)
			n = m->m_nextpkt;
	}
}

/*
 * Do NOT try to optimize the processing of inp's
 * by first pulling off all the inp's into a temporary
 * list (e.g. TAILQ_CONCAT). If you do that, the subtle
 * interactions of switching CPUs will kill you because of
 * problems in the linked-list manipulation. Basically
 * you would switch cpu's with the hpts mutex locked,
 * but then while you were processing one of the inp's
 * some other one that you switched will get a new
 * packet on the different CPU. It will insert it
 * on the new hpts's input list. Creating a temporary
 * link in the inp will not fix it either, since
 * the other hpts will be doing the same thing and
 * you will both end up using the temporary link.
 *
 * You will die in an ASSERT for tailq corruption if you
 * run INVARIANTS, or you will die horribly without
 * INVARIANTS in some unknown way with a corrupt linked
 * list.
 */
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
	struct tcpcb *tp;
	struct inpcb *inp;
	uint16_t drop_reason;
	int16_t set_cpu;
	uint32_t did_prefetch = 0;
	int dropped;

	HPTS_MTX_ASSERT(hpts);
	NET_EPOCH_ASSERT();

	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
		HPTS_MTX_ASSERT(hpts);
		hpts_sane_input_remove(hpts, inp, 0);
		if (inp->inp_input_cpu_set == 0) {
			set_cpu = 1;
		} else {
			set_cpu = 0;
		}
		hpts->p_inp = inp;
		drop_reason = inp->inp_hpts_drop_reas;
		inp->inp_in_input = 0;
		mtx_unlock(&hpts->p_mtx);
		INP_WLOCK(inp);
#ifdef VIMAGE
		CURVNET_SET(inp->inp_vnet);
#endif
		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
		    (inp->inp_flags2 & INP_FREED)) {
out:
			hpts->p_inp = NULL;
			if (in_pcbrele_wlocked(inp) == 0) {
				INP_WUNLOCK(inp);
			}
#ifdef VIMAGE
			CURVNET_RESTORE();
#endif
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		tp = intotcpcb(inp);
		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
			goto out;
		}
		if (drop_reason) {
			/* This tcb is being destroyed for drop_reason */
			tcp_drop_in_pkts(tp);
			tp = tcp_drop(tp, drop_reason);
			if (tp == NULL) {
				INP_WLOCK(inp);
			}
			if (in_pcbrele_wlocked(inp) == 0)
				INP_WUNLOCK(inp);
#ifdef VIMAGE
			CURVNET_RESTORE();
#endif
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		if (set_cpu) {
			/*
			 * Setup so the next time we will move to the right
			 * CPU. This should be a rare event. It will
			 * sometimes happen when we are the client side
			 * (usually not the server). Somehow tcp_output()
			 * gets called before tcp_do_segment() sets the
			 * initial state. This means the r_cpu and
			 * r_hpts_cpu are 0. We get on the hpts, and then
			 * tcp_input() gets called setting up the r_cpu to
			 * the correct value.
			 * The hpts goes off and sees the mis-match.
			 * We simply correct it here and the CPU will switch
			 * to the new hpts next time the tcb gets added to
			 * the hpts (not this time) :-)
			 */
			tcp_set_hpts(inp);
		}
		if (tp->t_fb_ptr != NULL) {
			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
			did_prefetch = 1;
		}
		if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
			if (inp->inp_in_input)
				tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
			dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
			if (dropped) {
				/* Re-acquire the wlock so we can release the reference */
				INP_WLOCK(inp);
			}
		} else if (tp->t_in_pkt) {
			/*
			 * We reach here only if we had a
			 * stack that supported INP_SUPPORTS_MBUFQ
			 * and then somehow switched to a stack that
			 * does not. The packets are basically stranded
			 * and would hang with the connection until
			 * cleanup without this code. It's not the
			 * best way, but I know of no other way to
			 * handle it, since the stack needs functions
			 * it does not have to handle queued packets.
			 */
			tcp_drop_in_pkts(tp);
		}
		if (in_pcbrele_wlocked(inp) == 0)
			INP_WUNLOCK(inp);
		INP_UNLOCK_ASSERT(inp);
#ifdef VIMAGE
		CURVNET_RESTORE();
#endif
		mtx_lock(&hpts->p_mtx);
		hpts->p_inp = NULL;
	}
}

static void
tcp_hptsi(struct tcp_hpts_entry *hpts)
{
	struct tcpcb *tp;
	struct inpcb *inp = NULL, *ninp;
	struct timeval tv;
	int32_t ticks_to_run, i, error;
	int32_t paced_cnt = 0;
	int32_t loop_cnt = 0;
	int32_t did_prefetch = 0;
	int32_t prefetch_ninp = 0;
	int32_t prefetch_tp = 0;
	int32_t wrap_loop_cnt = 0;
	int16_t set_cpu;

	HPTS_MTX_ASSERT(hpts);
	NET_EPOCH_ASSERT();

	/* record previous info for any logging */
	hpts->saved_lasttick = hpts->p_lasttick;
	hpts->saved_curtick = hpts->p_curtick;
	hpts->saved_curslot = hpts->p_cur_slot;
	hpts->saved_prev_slot = hpts->p_prev_slot;

	hpts->p_lasttick = hpts->p_curtick;
	hpts->p_curtick = tcp_gethptstick(&tv);
	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
	if ((hpts->p_on_queue_cnt == 0) ||
	    (hpts->p_lasttick == hpts->p_curtick)) {
		/*
		 * No time has yet passed,
		 * or nothing to do.
		 */
		hpts->p_prev_slot = hpts->p_cur_slot;
		hpts->p_lasttick = hpts->p_curtick;
		goto no_run;
	}
again:
	hpts->p_wheel_complete = 0;
	HPTS_MTX_ASSERT(hpts);
	ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
	if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
	    (hpts->p_on_queue_cnt != 0)) {
		/*
		 * Wheel wrap is occurring; basically we
		 * are behind and the distance between
		 * runs has spread so much it has exceeded
		 * the time on the wheel (1.024 seconds). This
		 * is ugly and should NOT be happening. We
		 * need to run the entire wheel. We last processed
		 * p_prev_slot, so that needs to be the last slot
		 * we run. The next slot after that should be our
		 * reserved first slot for new, and then starts
		 * the running position. Now the problem is the
		 * reserved "not yet" place does not exist,
		 * and there may be inp's in there that need
		 * running. We can merge those into the
		 * first slot at the head.
		 */
		wrap_loop_cnt++;
		hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
		/*
		 * Adjust p_cur_slot to be where we are starting from;
		 * hopefully we will catch up (fat chance if something
		 * is broken this bad :( )
		 */
		hpts->p_cur_slot = hpts->p_prev_slot;
		/*
		 * The next slot has guys to run too, and that would
		 * be where we would normally start; let's move them into
		 * the next slot (p_prev_slot + 2) so that we will
		 * run them. The extra 10 usec of lateness (from being
		 * put behind) does not really matter in this situation.
		 */
#ifdef INVARIANTS
		/*
		 * To prevent a panic we need to update the inp's slot to
		 * the new location. This is safe since it takes both the
		 * INP lock and the pacer mutex to change the inp_hptsslot.
		 */
		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
			inp->inp_hptsslot = hpts->p_runningtick;
		}
#endif
		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
		    &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
		ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
		counter_u64_add(wheel_wrap, 1);
	} else {
		/*
		 * Nxt slot is always one after p_runningtick, though
		 * it is usually not used, unless we are doing wheel wrap.
		 */
		hpts->p_nxt_slot = hpts->p_prev_slot;
		hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("tp:%p in_hpts input empty but cnt:%d",
		    hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_on_queue_cnt == 0) {
		goto no_one;
	}
	HPTS_MTX_ASSERT(hpts);
	for (i = 0; i < ticks_to_run; i++) {
		/*
		 * Calculate our delay; if there are no extra ticks there
		 * was not any (i.e. if ticks_to_run == 1, no delay).
		 */
		hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
		HPTS_MTX_ASSERT(hpts);
		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
			/* For debugging */
			hpts->p_inp = inp;
			paced_cnt++;
#ifdef INVARIANTS
			if (hpts->p_runningtick != inp->inp_hptsslot) {
				panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
				    hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
			}
#endif
			/* Now pull it */
			if (inp->inp_hpts_cpu_set == 0) {
				set_cpu = 1;
			} else {
				set_cpu = 0;
			}
			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
				/* We prefetch the next inp if possible */
				kern_prefetch(ninp, &prefetch_ninp);
				prefetch_ninp = 1;
			}
			if (inp->inp_hpts_request) {
				/*
				 * This guy is deferred out further in time
				 * than our wheel had available on it.
				 * Push him back on the wheel, or run it,
				 * depending.
				 */
				uint32_t maxticks, last_tick, remaining_slots;

				remaining_slots = ticks_to_run - (i + 1);
				if (inp->inp_hpts_request > remaining_slots) {
					/*
					 * How far out can we go?
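					 *
					 * For example (illustrative numbers):
					 * if inp_hpts_request is 5000 but
					 * maxticks comes back as 2000, the
					 * inp is parked at last_tick with the
					 * request reduced to 3000, and we
					 * repeat this on a later pass; once
					 * maxticks covers what remains, the
					 * inp is placed in its final slot.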
					 */
					maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
					if (maxticks >= inp->inp_hpts_request) {
						/* We can place it finally to be processed */
						inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
						inp->inp_hpts_request = 0;
					} else {
						/* Work off some more time */
						inp->inp_hptsslot = last_tick;
						inp->inp_hpts_request -= maxticks;
					}
					hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
					hpts->p_inp = NULL;
					continue;
				}
				inp->inp_hpts_request = 0;
				/* Fall through, we will do it now */
			}
			/*
			 * We clear the hpts flag here after dealing with
			 * remaining slots. This way anyone looking with the
			 * TCB lock will see it is on the hpts until just
			 * before we unlock.
			 */
			inp->inp_in_hpts = 0;
			mtx_unlock(&hpts->p_mtx);
			INP_WLOCK(inp);
			if (in_pcbrele_wlocked(inp)) {
				mtx_lock(&hpts->p_mtx);
				hpts->p_inp = NULL;
				continue;
			}
			if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
			    (inp->inp_flags2 & INP_FREED)) {
out_now:
#ifdef INVARIANTS
				if (mtx_owned(&hpts->p_mtx)) {
					panic("Hpts:%p owns mtx prior-to lock line:%d",
					    hpts, __LINE__);
				}
#endif
				INP_WUNLOCK(inp);
				mtx_lock(&hpts->p_mtx);
				hpts->p_inp = NULL;
				continue;
			}
			tp = intotcpcb(inp);
			if ((tp == NULL) || (tp->t_inpcb == NULL)) {
				goto out_now;
			}
			if (set_cpu) {
				/*
				 * Setup so the next time we will move to
				 * the right CPU. This should be a rare
				 * event. It will sometimes happen when we
				 * are the client side (usually not the
				 * server). Somehow tcp_output() gets called
				 * before tcp_do_segment() sets the
				 * initial state. This means the r_cpu and
				 * r_hpts_cpu are 0. We get on the hpts, and
				 * then tcp_input() gets called setting up
				 * the r_cpu to the correct value. The hpts
				 * goes off and sees the mis-match. We
				 * simply correct it here and the CPU will
				 * switch to the new hpts next time the tcb
				 * gets added to the hpts (not this one)
				 * :-)
				 */
				tcp_set_hpts(inp);
			}
#ifdef VIMAGE
			CURVNET_SET(inp->inp_vnet);
#endif
			/* Let's do any logging that we might want to */
			if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
				tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
			}
			/*
			 * There is a hole here; we get the refcnt on the
			 * inp so it will still be preserved, but to make
			 * sure we can get the INP we need to hold the p_mtx
			 * above while we pull out the tp/inp. As long as
			 * fini gets the lock first we are assured of having
			 * a sane INP we can lock and test.
			 */
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx before tcp-output:%d",
				    hpts, __LINE__);
			}
#endif
			if (tp->t_fb_ptr != NULL) {
				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
				did_prefetch = 1;
			}
			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
				error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
				if (error) {
					/* The input killed the connection */
					goto skip_pacing;
				}
			}
			inp->inp_hpts_calls = 1;
			error = tp->t_fb->tfb_tcp_output(tp);
			inp->inp_hpts_calls = 0;
			if (ninp && ninp->inp_ppcb) {
				/*
				 * If we have a nxt inp, see if we can
				 * prefetch its ppcb.
				 * Note this may seem
				 * "risky" since we have no locks (other
				 * than the previous inp) and there is no
				 * assurance that ninp was not pulled while
				 * we were processing inp and freed. If this
				 * occurred it could mean that either:
				 *
				 * a) It's NULL (which is fine, we won't go
				 * here) <or> b) It's valid (which is cool,
				 * we will prefetch it) <or> c) The inp got
				 * freed back to the slab which was
				 * reallocated. Then the piece of memory was
				 * re-used and something else (not an
				 * address) is in inp_ppcb. If that occurs
				 * we don't crash, but take a TLB shootdown
				 * performance hit (same as if it was NULL
				 * and we tried to pre-fetch it).
				 *
				 * Considering that the likelihood of <c> is
				 * quite rare we will take a risk on doing
				 * this. If performance drops after testing
				 * we can always take this out. NB: the
				 * kern_prefetch on amd64 actually has
				 * protection against a bad address now via
				 * the DMAP_() tests. This will prevent the
				 * TLB hit, and instead if <c> occurs just
				 * cause us to load cache with a useless
				 * address (to us).
				 */
				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
				prefetch_tp = 1;
			}
			INP_WUNLOCK(inp);
skip_pacing:
#ifdef VIMAGE
			CURVNET_RESTORE();
#endif
			INP_UNLOCK_ASSERT(inp);
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx prior-to lock line:%d",
				    hpts, __LINE__);
			}
#endif
			mtx_lock(&hpts->p_mtx);
			hpts->p_inp = NULL;
		}
		HPTS_MTX_ASSERT(hpts);
		hpts->p_inp = NULL;
		hpts->p_runningtick++;
		if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
			hpts->p_runningtick = 0;
		}
	}
no_one:
	HPTS_MTX_ASSERT(hpts);
	hpts->p_delayed_by = 0;
	/*
	 * Check to see if we took an excess amount of time and need to run
	 * more ticks (if we did not hit ENOBUFS).
	 */
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("tp:%p in_hpts input empty but cnt:%d",
		    hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	hpts->p_prev_slot = hpts->p_cur_slot;
	hpts->p_lasttick = hpts->p_curtick;
	if (loop_cnt > max_pacer_loops) {
		/*
		 * Something is seriously slow: we have
		 * looped through processing the wheel,
		 * and by the time we cleared what
		 * needed to run we had looped
		 * max_pacer_loops times and still
		 * needed to run. That means the system
		 * is hopelessly behind and can never
		 * catch up :(
		 *
		 * We will just lie to this thread
		 * and let it think p_curtick is
		 * correct. When it next awakens
		 * it will find itself further behind.
		 */
		counter_u64_add(hpts_hopelessly_behind, 1);
		goto no_run;
	}
	hpts->p_curtick = tcp_gethptstick(&tv);
	hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
	if ((wrap_loop_cnt < 2) &&
	    (hpts->p_lasttick != hpts->p_curtick)) {
		counter_u64_add(hpts_loops, 1);
		loop_cnt++;
		goto again;
	}
no_run:
	/*
	 * Set the flag to say we are done, for any slot
	 * insertion that happens while we run input.
	 */
	hpts->p_wheel_complete = 1;
	/*
	 * Run any input that may be queued and was not
	 * covered by the run above.
	 */
	if (!TAILQ_EMPTY(&hpts->p_input)) {
		tcp_input_data(hpts, &tv);
		/*
		 * Now did we spend too long running
		 * input and need to run more ticks?
		 */
		KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
		    ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
		    hpts->p_prev_slot, hpts->p_cur_slot));
		KASSERT(hpts->p_lasttick == hpts->p_curtick,
		    ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
		    hpts->p_lasttick, hpts->p_curtick));
		hpts->p_curtick = tcp_gethptstick(&tv);
		if (hpts->p_lasttick != hpts->p_curtick) {
			counter_u64_add(hpts_loops, 1);
			hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
			goto again;
		}
	}
	{
		uint32_t t = 0, i, fnd = 0;

		if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
			/*
			 * Find the next slot that is occupied and use
			 * that to be the sleep time.
			 */
			for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
				if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
					fnd = 1;
					break;
				}
				t = (t + 1) % NUM_OF_HPTSI_SLOTS;
			}
			if (fnd) {
				hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
			} else {
#ifdef INVARIANTS
				panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
				counter_u64_add(back_tosleep, 1);
				hpts->p_on_queue_cnt = 0;
				goto non_found;
			}
		} else if (wrap_loop_cnt >= 2) {
			/* Special case handling */
			hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
		} else {
			/* No one on the wheel; sleep for all but 400 slots, or sleep max */
non_found:
			hpts->p_hpts_sleep_time = hpts_sleep_max;
		}
	}
}

void
__tcp_set_hpts(struct inpcb *inp, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	if ((inp->inp_in_hpts == 0) &&
	    (inp->inp_hpts_cpu_set == 0)) {
		inp->inp_hpts_cpu = hpts_cpuid(inp);
		inp->inp_hpts_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
	hpts = tcp_input_lock(inp);
	if ((inp->inp_input_cpu_set == 0) &&
	    (inp->inp_in_input == 0)) {
		inp->inp_input_cpu = hpts_cpuid(inp);
		inp->inp_input_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
}

uint16_t
tcp_hpts_delayedby(struct inpcb *inp)
{
	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
}

static void
tcp_hpts_thread(void *ctx)
{
	struct tcp_hpts_entry *hpts;
	struct epoch_tracker et;
	struct timeval tv;
	sbintime_t sb;

	hpts = (struct tcp_hpts_entry *)ctx;
	mtx_lock(&hpts->p_mtx);
	if (hpts->p_direct_wake) {
		/* Signaled by input */
		callout_stop(&hpts->co);
	} else {
		/* Timed out */
		if (callout_pending(&hpts->co) ||
		    !callout_active(&hpts->co)) {
			mtx_unlock(&hpts->p_mtx);
			return;
		}
		callout_deactivate(&hpts->co);
	}
	hpts->p_hpts_wake_scheduled = 0;
	hpts->p_hpts_active = 1;
	NET_EPOCH_ENTER(et);
	tcp_hptsi(hpts);
	NET_EPOCH_EXIT(et);
	HPTS_MTX_ASSERT(hpts);
	tv.tv_sec = 0;
	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
		hpts->overidden_sleep = tv.tv_usec;
		tv.tv_usec = tcp_min_hptsi_time;
		hpts->p_on_min_sleep = 1;
	} else {
		/* Clear the min sleep flag */
		hpts->overidden_sleep = 0;
		hpts->p_on_min_sleep = 0;
	}
	hpts->p_hpts_active = 0;
	sb = tvtosbt(tv);
	if (tcp_hpts_callout_skip_swi == 0) {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_swi, hpts, hpts->p_cpu,
		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
	} else {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_dir, hpts,
		    hpts->p_cpu,
		    C_PREL(tcp_hpts_precision));
	}
	hpts->p_direct_wake = 0;
	mtx_unlock(&hpts->p_mtx);
}

#undef	timersub

static void
tcp_init_hptsi(void *st)
{
	int32_t i, j, error, bound = 0, created = 0;
	size_t sz, asz;
	struct timeval tv;
	sbintime_t sb;
	struct tcp_hpts_entry *hpts;
	struct pcpu *pc;
	cpuset_t cs;
	char unit[16];
	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
	int count, domain;

	tcp_pace.rp_proc = NULL;
	tcp_pace.rp_num_hptss = ncpus;
	hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
	hpts_loops = counter_u64_alloc(M_WAITOK);
	back_tosleep = counter_u64_alloc(M_WAITOK);
	combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
	wheel_wrap = counter_u64_alloc(M_WAITOK);
	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
		    M_TCPHPTS, M_WAITOK | M_ZERO);
		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
		    M_TCPHPTS, M_WAITOK);
		hpts = tcp_pace.rp_ent[i];
		/*
		 * Init all the hpts structures that are not specifically
		 * zero'd by the allocations. Also let's attach them to the
		 * appropriate sysctl block as well.
		 */
		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
		    "hpts", MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&hpts->p_input);
		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
			TAILQ_INIT(&hpts->p_hptss[j]);
		}
		sysctl_ctx_init(&hpts->hpts_ctx);
		sprintf(unit, "%d", i);
		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
		    OID_AUTO,
		    unit,
		    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
		    "");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
		    &hpts->p_on_inqueue_cnt, 0,
		    "Count TCBs awaiting input processing");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
		    &hpts->p_on_queue_cnt, 0,
		    "Count TCBs awaiting output processing");
		SYSCTL_ADD_U16(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "active", CTLFLAG_RD,
		    &hpts->p_hpts_active, 0,
		    "Is the hpts active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curslot", CTLFLAG_RD,
		    &hpts->p_cur_slot, 0,
		    "What the current running pacer's goal is");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "runtick", CTLFLAG_RD,
		    &hpts->p_runningtick, 0,
		    "What the running pacer's current slot is");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curtick", CTLFLAG_RD,
		    &hpts->p_curtick, 0,
		    "What the running pacer's last tick mapped to the wheel was");
		hpts->p_hpts_sleep_time = hpts_sleep_max;
		hpts->p_num = i;
		hpts->p_curtick = tcp_gethptstick(&tv);
		hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
		hpts->p_cpu = 0xffff;
		hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
		callout_init(&hpts->co, 1);
	}

	/* Don't try to bind to NUMA domains if we don't have any */
	if (vm_ndomains == 1 && tcp_bind_threads == 2)
		tcp_bind_threads = 0;

	/*
	 * Now let's start ithreads to handle the hptss.
	 */
	CPU_FOREACH(i) {
		hpts = tcp_pace.rp_ent[i];
		hpts->p_cpu = i;
		error = swi_add(&hpts->ie, "hpts",
		    tcp_hpts_thread, (void *)hpts,
		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
		if (error) {
			panic("Can't add hpts:%p i:%d err:%d",
			    hpts, i, error);
		}
		created++;
		if (tcp_bind_threads == 1) {
			if (intr_event_bind(hpts->ie, i) == 0)
				bound++;
		} else if (tcp_bind_threads == 2) {
			pc = pcpu_find(i);
			domain = pc->pc_domain;
			CPU_COPY(&cpuset_domain[domain], &cs);
			if (intr_event_bind_ithread_cpuset(hpts->ie, &cs)
			    == 0) {
				bound++;
				count = hpts_domains[domain].count;
				hpts_domains[domain].cpu[count] = i;
				hpts_domains[domain].count++;
			}
		}
		tv.tv_sec = 0;
		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
		sb = tvtosbt(tv);
		if (tcp_hpts_callout_skip_swi == 0) {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_swi, hpts, hpts->p_cpu,
			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
		} else {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_dir, hpts,
			    hpts->p_cpu,
			    C_PREL(tcp_hpts_precision));
		}
	}
	/*
	 * If we somehow have an empty domain, fall back to choosing
	 * among all hpts threads.
	 */
	for (i = 0; i < vm_ndomains; i++) {
		if (hpts_domains[i].count == 0) {
			tcp_bind_threads = 0;
			break;
		}
	}

	printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
	    created, bound,
	    tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
}

SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);