1 /*- 2 * Copyright (c) 2016-2018 Netflix Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 */ 26 #include <sys/cdefs.h> 27 __FBSDID("$FreeBSD$"); 28 29 #include "opt_inet.h" 30 #include "opt_inet6.h" 31 #include "opt_ipsec.h" 32 #include "opt_tcpdebug.h" 33 /** 34 * Some notes about usage. 35 * 36 * The tcp_hpts system is designed to provide a high precision timer 37 * system for tcp. Its main purpose is to provide a mechanism for 38 * pacing packets out onto the wire. It can be used in two ways 39 * by a given TCP stack (and those two methods can be used simultaneously). 
 *
 * First, and probably the main thing it's used by Rack and BBR for, it can
 * be used to call tcp_output() of a transport stack at some time in the future.
 * The normal way this is done is that tcp_output() of the stack schedules
 * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
 * slot is the time from now that the stack wants to be called but it
 * must be converted to tcp_hpts's notion of slot. This is done with
 * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
 * call from the tcp_output() routine might look like:
 *
 * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add near
 * its top a check to prevent unwanted calls (from user land or the
 * arrival of incoming ack's). So it would add something like:
 *
 * if (inp->inp_in_hpts)
 *    return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare-bones example and the stack will probably
 * have more considerations than just the above.
 *
 * Now the tcp_hpts system will call tcp_output in one of two forms:
 * it will first check to see if the stack has defined a
 * tfb_tcp_output_wtime() function, if so that is the routine it
 * will call; if that function is not defined then it will call the
 * tfb_tcp_output() function. The only difference between these
 * two calls is that the former passes the time in to the function
 * so the function does not have to access the time (which tcp_hpts
 * already has). What these functions do is of course totally up
 * to the individual tcp stack.
 *
 * Now the second function (actually two functions I guess :D)
 * the tcp_hpts system provides is the ability to either abort
 * a connection (later) or process input on a connection.
 * Why would you want to do this? To keep processor locality.
 *
 * So in order to use the input redirection function the
 * stack changes its tcp_do_segment() routine to, instead
 * of processing the data, call the function:
 *
 * tcp_queue_pkt_to_input()
 *
 * You will note that the arguments to this function look
 * a lot like tcp_do_segment's arguments. This function
 * will assure that the tcp_hpts system will
 * call the function tfb_tcp_hpts_do_segment() from the
 * correct CPU. Note that multiple calls can get pushed
 * into the tcp_hpts system; this will be indicated by
 * the next to last argument to tfb_tcp_hpts_do_segment()
 * (nxt_pkt). If nxt_pkt is a 1 then another packet is
 * coming. If nxt_pkt is a 0 then this is the last call
 * that the tcp_hpts system has available for the tcp stack.
 *
 * The other point of the input system is to be able to safely
 * drop a tcp connection without worrying about the recursive
 * locking that may be occurring on the INP_WLOCK. So if
 * a stack wants to drop a connection it calls:
 *
 * tcp_set_inp_to_drop(tp, ETIMEDOUT)
 *
 * To schedule the tcp_hpts system to call
 *
 * tcp_drop(tp, drop_reason)
 *
 * at a future point. This is quite handy to prevent locking
 * issues when dropping connections.
109 * 110 */ 111 112 #include <sys/param.h> 113 #include <sys/bus.h> 114 #include <sys/interrupt.h> 115 #include <sys/module.h> 116 #include <sys/kernel.h> 117 #include <sys/hhook.h> 118 #include <sys/malloc.h> 119 #include <sys/mbuf.h> 120 #include <sys/proc.h> /* for proc0 declaration */ 121 #include <sys/socket.h> 122 #include <sys/socketvar.h> 123 #include <sys/sysctl.h> 124 #include <sys/systm.h> 125 #include <sys/refcount.h> 126 #include <sys/sched.h> 127 #include <sys/queue.h> 128 #include <sys/smp.h> 129 #include <sys/counter.h> 130 #include <sys/time.h> 131 #include <sys/kthread.h> 132 #include <sys/kern_prefetch.h> 133 134 #include <vm/uma.h> 135 136 #include <net/route.h> 137 #include <net/vnet.h> 138 139 #define TCPSTATES /* for logging */ 140 141 #include <netinet/in.h> 142 #include <netinet/in_kdtrace.h> 143 #include <netinet/in_pcb.h> 144 #include <netinet/ip.h> 145 #include <netinet/ip_icmp.h> /* required for icmp_var.h */ 146 #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ 147 #include <netinet/ip_var.h> 148 #include <netinet/ip6.h> 149 #include <netinet6/in6_pcb.h> 150 #include <netinet6/ip6_var.h> 151 #include <netinet/tcp.h> 152 #include <netinet/tcp_fsm.h> 153 #include <netinet/tcp_seq.h> 154 #include <netinet/tcp_timer.h> 155 #include <netinet/tcp_var.h> 156 #include <netinet/tcpip.h> 157 #include <netinet/cc/cc.h> 158 #include <netinet/tcp_hpts.h> 159 160 #ifdef tcpdebug 161 #include <netinet/tcp_debug.h> 162 #endif /* tcpdebug */ 163 #ifdef tcp_offload 164 #include <netinet/tcp_offload.h> 165 #endif 166 167 #ifdef ipsec 168 #include <netipsec/ipsec.h> 169 #include <netipsec/ipsec6.h> 170 #endif /* ipsec */ 171 #include "opt_rss.h" 172 173 MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); 174 #ifdef RSS 175 static int tcp_bind_threads = 1; 176 #else 177 static int tcp_bind_threads = 0; 178 #endif 179 TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); 180 181 static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; 182 183 
TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); 184 185 static struct tcp_hptsi tcp_pace; 186 187 static void tcp_wakehpts(struct tcp_hpts_entry *p); 188 static void tcp_wakeinput(struct tcp_hpts_entry *p); 189 static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); 190 static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); 191 static void tcp_hpts_thread(void *ctx); 192 static void tcp_init_hptsi(void *st); 193 194 int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP; 195 static int32_t tcp_hpts_callout_skip_swi = 0; 196 197 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls"); 198 199 #define timersub(tvp, uvp, vvp) \ 200 do { \ 201 (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \ 202 (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \ 203 if ((vvp)->tv_usec < 0) { \ 204 (vvp)->tv_sec--; \ 205 (vvp)->tv_usec += 1000000; \ 206 } \ 207 } while (0) 208 209 static int32_t logging_on = 0; 210 static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); 211 static int32_t tcp_hpts_precision = 120; 212 213 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW, 214 &tcp_hpts_precision, 120, 215 "Value for PRE() precision of callout"); 216 217 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, 218 &logging_on, 0, 219 "Turn on logging if compiled in"); 220 221 counter_u64_t hpts_loops; 222 223 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, 224 &hpts_loops, "Number of times hpts had to loop to catch up"); 225 226 counter_u64_t back_tosleep; 227 228 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, 229 &back_tosleep, "Number of times hpts found no tcbs"); 230 231 static int32_t in_newts_every_tcb = 0; 232 233 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, 234 &in_newts_every_tcb, 0, 235 "Do we have a new cts every tcb we process for input"); 236 static int32_t in_ts_percision = 0; 237 238 
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, 239 &in_ts_percision, 0, 240 "Do we use percise timestamp for clients on input"); 241 static int32_t out_newts_every_tcb = 0; 242 243 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, 244 &out_newts_every_tcb, 0, 245 "Do we have a new cts every tcb we process for output"); 246 static int32_t out_ts_percision = 0; 247 248 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, 249 &out_ts_percision, 0, 250 "Do we use a percise timestamp for every output cts"); 251 252 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, 253 &hpts_sleep_max, 0, 254 "The maximum time the hpts will sleep <1 - 254>"); 255 256 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, 257 &tcp_min_hptsi_time, 0, 258 "The minimum time the hpts must sleep before processing more slots"); 259 260 SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW, 261 &tcp_hpts_callout_skip_swi, 0, 262 "Do we have the callout call directly to the hpts?"); 263 264 static void 265 __tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, 266 uint32_t ticknow, int32_t line) 267 { 268 struct hpts_log *pl; 269 270 HPTS_MTX_ASSERT(hpts); 271 if (hpts->p_log == NULL) 272 return; 273 pl = &hpts->p_log[hpts->p_log_at]; 274 hpts->p_log_at++; 275 if (hpts->p_log_at >= hpts->p_logsize) { 276 hpts->p_log_at = 0; 277 hpts->p_log_wrapped = 1; 278 } 279 pl->inp = inp; 280 if (inp) { 281 pl->t_paceslot = inp->inp_hptsslot; 282 pl->t_hptsreq = inp->inp_hpts_request; 283 pl->p_onhpts = inp->inp_in_hpts; 284 pl->p_oninput = inp->inp_in_input; 285 } else { 286 pl->t_paceslot = 0; 287 pl->t_hptsreq = 0; 288 pl->p_onhpts = 0; 289 pl->p_oninput = 0; 290 } 291 pl->is_notempty = 1; 292 pl->event = event; 293 pl->line = line; 294 pl->cts = tcp_get_usecs(NULL); 295 pl->p_curtick = hpts->p_curtick; 296 pl->p_prevtick = hpts->p_prevtick; 297 pl->p_on_queue_cnt = 
hpts->p_on_queue_cnt; 298 pl->ticknow = ticknow; 299 pl->slot_req = slot; 300 pl->p_nxt_slot = hpts->p_nxt_slot; 301 pl->p_cur_slot = hpts->p_cur_slot; 302 pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; 303 pl->p_flags = (hpts->p_cpu & 0x7f); 304 pl->p_flags <<= 7; 305 pl->p_flags |= (hpts->p_num & 0x7f); 306 pl->p_flags <<= 2; 307 if (hpts->p_hpts_active) { 308 pl->p_flags |= HPTS_HPTS_ACTIVE; 309 } 310 } 311 312 #define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) 313 314 static void 315 hpts_timeout_swi(void *arg) 316 { 317 struct tcp_hpts_entry *hpts; 318 319 hpts = (struct tcp_hpts_entry *)arg; 320 swi_sched(hpts->ie_cookie, 0); 321 } 322 323 static void 324 hpts_timeout_dir(void *arg) 325 { 326 tcp_hpts_thread(arg); 327 } 328 329 static inline void 330 hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear) 331 { 332 #ifdef INVARIANTS 333 if (mtx_owned(&hpts->p_mtx) == 0) { 334 /* We don't own the mutex? */ 335 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); 336 } 337 if (hpts->p_cpu != inp->inp_hpts_cpu) { 338 /* It is not the right cpu/mutex? */ 339 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); 340 } 341 if (inp->inp_in_hpts == 0) { 342 /* We are not on the hpts? */ 343 panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); 344 } 345 if (TAILQ_EMPTY(head) && 346 (hpts->p_on_queue_cnt != 0)) { 347 /* We should not be empty with a queue count */ 348 panic("%s hpts:%p hpts bucket empty but cnt:%d", 349 __FUNCTION__, hpts, hpts->p_on_queue_cnt); 350 } 351 #endif 352 TAILQ_REMOVE(head, inp, inp_hpts); 353 hpts->p_on_queue_cnt--; 354 if (hpts->p_on_queue_cnt < 0) { 355 /* Count should not go negative .. 
*/ 356 #ifdef INVARIANTS 357 panic("Hpts goes negative inp:%p hpts:%p", 358 inp, hpts); 359 #endif 360 hpts->p_on_queue_cnt = 0; 361 } 362 if (clear) { 363 inp->inp_hpts_request = 0; 364 inp->inp_in_hpts = 0; 365 } 366 } 367 368 static inline void 369 hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref) 370 { 371 #ifdef INVARIANTS 372 if (mtx_owned(&hpts->p_mtx) == 0) { 373 /* We don't own the mutex? */ 374 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); 375 } 376 if (hpts->p_cpu != inp->inp_hpts_cpu) { 377 /* It is not the right cpu/mutex? */ 378 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); 379 } 380 if ((noref == 0) && (inp->inp_in_hpts == 1)) { 381 /* We are already on the hpts? */ 382 panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp); 383 } 384 #endif 385 TAILQ_INSERT_TAIL(head, inp, inp_hpts); 386 inp->inp_in_hpts = 1; 387 hpts->p_on_queue_cnt++; 388 if (noref == 0) { 389 in_pcbref(inp); 390 } 391 } 392 393 static inline void 394 hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear) 395 { 396 #ifdef INVARIANTS 397 if (mtx_owned(&hpts->p_mtx) == 0) { 398 /* We don't own the mutex? */ 399 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); 400 } 401 if (hpts->p_cpu != inp->inp_input_cpu) { 402 /* It is not the right cpu/mutex? */ 403 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); 404 } 405 if (inp->inp_in_input == 0) { 406 /* We are not on the input hpts? 
*/ 407 panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp); 408 } 409 #endif 410 TAILQ_REMOVE(&hpts->p_input, inp, inp_input); 411 hpts->p_on_inqueue_cnt--; 412 if (hpts->p_on_inqueue_cnt < 0) { 413 #ifdef INVARIANTS 414 panic("Hpts in goes negative inp:%p hpts:%p", 415 inp, hpts); 416 #endif 417 hpts->p_on_inqueue_cnt = 0; 418 } 419 #ifdef INVARIANTS 420 if (TAILQ_EMPTY(&hpts->p_input) && 421 (hpts->p_on_inqueue_cnt != 0)) { 422 /* We should not be empty with a queue count */ 423 panic("%s hpts:%p in_hpts input empty but cnt:%d", 424 __FUNCTION__, hpts, hpts->p_on_inqueue_cnt); 425 } 426 #endif 427 if (clear) 428 inp->inp_in_input = 0; 429 } 430 431 static inline void 432 hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line) 433 { 434 #ifdef INVARIANTS 435 if (mtx_owned(&hpts->p_mtx) == 0) { 436 /* We don't own the mutex? */ 437 panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp); 438 } 439 if (hpts->p_cpu != inp->inp_input_cpu) { 440 /* It is not the right cpu/mutex? */ 441 panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp); 442 } 443 if (inp->inp_in_input == 1) { 444 /* We are already on the input hpts? */ 445 panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp); 446 } 447 #endif 448 TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input); 449 inp->inp_in_input = 1; 450 hpts->p_on_inqueue_cnt++; 451 in_pcbref(inp); 452 } 453 454 static int 455 sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) 456 { 457 struct tcp_hpts_entry *hpts; 458 size_t sz; 459 int32_t logging_was, i; 460 int32_t error = 0; 461 462 /* 463 * HACK: Turn off logging so no locks are required this really needs 464 * a memory barrier :) 465 */ 466 logging_was = logging_on; 467 logging_on = 0; 468 if (!req->oldptr) { 469 /* How much? 
*/ 470 sz = 0; 471 for (i = 0; i < tcp_pace.rp_num_hptss; i++) { 472 hpts = tcp_pace.rp_ent[i]; 473 if (hpts->p_log == NULL) 474 continue; 475 sz += (sizeof(struct hpts_log) * hpts->p_logsize); 476 } 477 error = SYSCTL_OUT(req, 0, sz); 478 } else { 479 for (i = 0; i < tcp_pace.rp_num_hptss; i++) { 480 hpts = tcp_pace.rp_ent[i]; 481 if (hpts->p_log == NULL) 482 continue; 483 if (hpts->p_log_wrapped) 484 sz = (sizeof(struct hpts_log) * hpts->p_logsize); 485 else 486 sz = (sizeof(struct hpts_log) * hpts->p_log_at); 487 error = SYSCTL_OUT(req, hpts->p_log, sz); 488 } 489 } 490 logging_on = logging_was; 491 return error; 492 } 493 494 SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 495 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); 496 497 498 static void 499 tcp_wakehpts(struct tcp_hpts_entry *hpts) 500 { 501 HPTS_MTX_ASSERT(hpts); 502 swi_sched(hpts->ie_cookie, 0); 503 if (hpts->p_hpts_active == 2) { 504 /* Rare sleeping on a ENOBUF */ 505 wakeup_one(hpts); 506 } 507 } 508 509 static void 510 tcp_wakeinput(struct tcp_hpts_entry *hpts) 511 { 512 HPTS_MTX_ASSERT(hpts); 513 swi_sched(hpts->ie_cookie, 0); 514 if (hpts->p_hpts_active == 2) { 515 /* Rare sleeping on a ENOBUF */ 516 wakeup_one(hpts); 517 } 518 } 519 520 struct tcp_hpts_entry * 521 tcp_cur_hpts(struct inpcb *inp) 522 { 523 int32_t hpts_num; 524 struct tcp_hpts_entry *hpts; 525 526 hpts_num = inp->inp_hpts_cpu; 527 hpts = tcp_pace.rp_ent[hpts_num]; 528 return (hpts); 529 } 530 531 struct tcp_hpts_entry * 532 tcp_hpts_lock(struct inpcb *inp) 533 { 534 struct tcp_hpts_entry *hpts; 535 int32_t hpts_num; 536 537 again: 538 hpts_num = inp->inp_hpts_cpu; 539 hpts = tcp_pace.rp_ent[hpts_num]; 540 #ifdef INVARIANTS 541 if (mtx_owned(&hpts->p_mtx)) { 542 panic("Hpts:%p owns mtx prior-to lock line:%d", 543 hpts, __LINE__); 544 } 545 #endif 546 mtx_lock(&hpts->p_mtx); 547 if (hpts_num != inp->inp_hpts_cpu) { 548 mtx_unlock(&hpts->p_mtx); 549 goto again; 550 } 551 return 
(hpts); 552 } 553 554 struct tcp_hpts_entry * 555 tcp_input_lock(struct inpcb *inp) 556 { 557 struct tcp_hpts_entry *hpts; 558 int32_t hpts_num; 559 560 again: 561 hpts_num = inp->inp_input_cpu; 562 hpts = tcp_pace.rp_ent[hpts_num]; 563 #ifdef INVARIANTS 564 if (mtx_owned(&hpts->p_mtx)) { 565 panic("Hpts:%p owns mtx prior-to lock line:%d", 566 hpts, __LINE__); 567 } 568 #endif 569 mtx_lock(&hpts->p_mtx); 570 if (hpts_num != inp->inp_input_cpu) { 571 mtx_unlock(&hpts->p_mtx); 572 goto again; 573 } 574 return (hpts); 575 } 576 577 static void 578 tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) 579 { 580 int32_t add_freed; 581 582 if (inp->inp_flags2 & INP_FREED) { 583 /* 584 * Need to play a special trick so that in_pcbrele_wlocked 585 * does not return 1 when it really should have returned 0. 586 */ 587 add_freed = 1; 588 inp->inp_flags2 &= ~INP_FREED; 589 } else { 590 add_freed = 0; 591 } 592 #ifndef INP_REF_DEBUG 593 if (in_pcbrele_wlocked(inp)) { 594 /* 595 * This should not happen. We have the inpcb referred to by 596 * the main socket (why we are called) and the hpts. It 597 * should always return 0. 598 */ 599 panic("inpcb:%p release ret 1", 600 inp); 601 } 602 #else 603 if (__in_pcbrele_wlocked(inp, line)) { 604 /* 605 * This should not happen. We have the inpcb referred to by 606 * the main socket (why we are called) and the hpts. It 607 * should always return 0. 
608 */ 609 panic("inpcb:%p release ret 1", 610 inp); 611 } 612 #endif 613 if (add_freed) { 614 inp->inp_flags2 |= INP_FREED; 615 } 616 } 617 618 static void 619 tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) 620 { 621 if (inp->inp_in_hpts) { 622 hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1); 623 tcp_remove_hpts_ref(inp, hpts, line); 624 } 625 } 626 627 static void 628 tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line) 629 { 630 HPTS_MTX_ASSERT(hpts); 631 if (inp->inp_in_input) { 632 hpts_sane_input_remove(hpts, inp, 1); 633 tcp_remove_hpts_ref(inp, hpts, line); 634 } 635 } 636 637 /* 638 * Called normally with the INP_LOCKED but it 639 * does not matter, the hpts lock is the key 640 * but the lock order allows us to hold the 641 * INP lock and then get the hpts lock. 642 * 643 * Valid values in the flags are 644 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. 645 * HPTS_REMOVE_INPUT - remove from the input of the hpts. 646 * Note that you can or both values together and get two 647 * actions. 
648 */ 649 void 650 __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) 651 { 652 struct tcp_hpts_entry *hpts; 653 654 INP_WLOCK_ASSERT(inp); 655 if (flags & HPTS_REMOVE_OUTPUT) { 656 hpts = tcp_hpts_lock(inp); 657 tcp_hpts_remove_locked_output(hpts, inp, flags, line); 658 mtx_unlock(&hpts->p_mtx); 659 } 660 if (flags & HPTS_REMOVE_INPUT) { 661 hpts = tcp_input_lock(inp); 662 tcp_hpts_remove_locked_input(hpts, inp, flags, line); 663 mtx_unlock(&hpts->p_mtx); 664 } 665 } 666 667 static inline int 668 hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) 669 { 670 return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); 671 } 672 673 static int 674 tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) 675 { 676 int32_t need_wake = 0; 677 uint32_t ticknow = 0; 678 679 HPTS_MTX_ASSERT(hpts); 680 if (inp->inp_in_hpts == 0) { 681 /* Ok we need to set it on the hpts in the current slot */ 682 if (hpts->p_hpts_active == 0) { 683 /* A sleeping hpts we want in next slot to run */ 684 if (logging_on) { 685 tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, 686 hpts_tick(hpts, 1)); 687 } 688 inp->inp_hptsslot = hpts_tick(hpts, 1); 689 inp->inp_hpts_request = 0; 690 if (logging_on) { 691 tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); 692 } 693 need_wake = 1; 694 } else if ((void *)inp == hpts->p_inp) { 695 /* 696 * We can't allow you to go into the same slot we 697 * are in. We must put you out. 698 */ 699 inp->inp_hptsslot = hpts->p_nxt_slot; 700 } else 701 inp->inp_hptsslot = hpts->p_cur_slot; 702 hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); 703 inp->inp_hpts_request = 0; 704 if (logging_on) { 705 tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); 706 } 707 if (need_wake) { 708 /* 709 * Activate the hpts if it is sleeping and its 710 * timeout is not 1. 
711 */ 712 if (logging_on) { 713 tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); 714 } 715 hpts->p_direct_wake = 1; 716 tcp_wakehpts(hpts); 717 } 718 } 719 return (need_wake); 720 } 721 722 int 723 __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line) 724 { 725 int32_t ret; 726 struct tcp_hpts_entry *hpts; 727 728 INP_WLOCK_ASSERT(inp); 729 hpts = tcp_hpts_lock(inp); 730 ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); 731 mtx_unlock(&hpts->p_mtx); 732 return (ret); 733 } 734 735 static void 736 tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, 737 struct hpts_diag *diag, int32_t noref) 738 { 739 int32_t need_new_to = 0; 740 int32_t need_wakeup = 0; 741 uint32_t largest_slot; 742 uint32_t ticknow = 0; 743 uint32_t slot_calc; 744 745 HPTS_MTX_ASSERT(hpts); 746 if (diag) { 747 memset(diag, 0, sizeof(struct hpts_diag)); 748 diag->p_hpts_active = hpts->p_hpts_active; 749 diag->p_nxt_slot = hpts->p_nxt_slot; 750 diag->p_cur_slot = hpts->p_cur_slot; 751 diag->slot_req = slot; 752 } 753 if ((inp->inp_in_hpts == 0) || noref) { 754 inp->inp_hpts_request = slot; 755 if (slot == 0) { 756 /* Immediate */ 757 tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); 758 return; 759 } 760 if (hpts->p_hpts_active) { 761 /* 762 * Its slot - 1 since nxt_slot is the next tick that 763 * will go off since the hpts is awake 764 */ 765 if (logging_on) { 766 tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); 767 } 768 /* 769 * We want to make sure that we don't place a inp in 770 * the range of p_cur_slot <-> p_nxt_slot. If we 771 * take from p_nxt_slot to the end, plus p_cur_slot 772 * and then take away 2, we will know how many is 773 * the max slots we can use. 774 */ 775 if (hpts->p_nxt_slot > hpts->p_cur_slot) { 776 /* 777 * Non-wrap case nxt_slot <-> cur_slot we 778 * don't want to land in. So the diff gives 779 * us what is taken away from the number of 780 * slots. 
781 */ 782 largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); 783 } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { 784 largest_slot = NUM_OF_HPTSI_SLOTS - 2; 785 } else { 786 /* 787 * Wrap case so the diff gives us the number 788 * of slots that we can land in. 789 */ 790 largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; 791 } 792 /* 793 * We take away two so we never have a problem (20 794 * usec's) out of 1024000 usecs 795 */ 796 largest_slot -= 2; 797 if (inp->inp_hpts_request > largest_slot) { 798 /* 799 * Restrict max jump of slots and remember 800 * leftover 801 */ 802 slot = largest_slot; 803 inp->inp_hpts_request -= largest_slot; 804 } else { 805 /* This one will run when we hit it */ 806 inp->inp_hpts_request = 0; 807 } 808 if (hpts->p_nxt_slot == hpts->p_cur_slot) 809 slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; 810 else 811 slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; 812 if (slot_calc == hpts->p_cur_slot) { 813 #ifdef INVARIANTS 814 /* TSNH */ 815 panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", 816 hpts, slot_calc, slot, largest_slot); 817 #endif 818 if (slot_calc) 819 slot_calc--; 820 else 821 slot_calc = NUM_OF_HPTSI_SLOTS - 1; 822 } 823 inp->inp_hptsslot = slot_calc; 824 if (diag) { 825 diag->inp_hptsslot = inp->inp_hptsslot; 826 } 827 } else { 828 /* 829 * The hpts is sleeping, we need to figure out where 830 * it will wake up at and if we need to reschedule 831 * its time-out. 832 */ 833 uint32_t have_slept, yet_to_sleep; 834 uint32_t slot_now; 835 struct timeval tv; 836 837 ticknow = tcp_gethptstick(&tv); 838 slot_now = ticknow % NUM_OF_HPTSI_SLOTS; 839 /* 840 * The user wants to be inserted at (slot_now + 841 * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up. 
842 */ 843 largest_slot = NUM_OF_HPTSI_SLOTS - 2; 844 if (inp->inp_hpts_request > largest_slot) { 845 /* Adjust the residual in inp_hpts_request */ 846 slot = largest_slot; 847 inp->inp_hpts_request -= largest_slot; 848 } else { 849 /* No residual it all fits */ 850 inp->inp_hpts_request = 0; 851 } 852 inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; 853 if (diag) { 854 diag->slot_now = slot_now; 855 diag->inp_hptsslot = inp->inp_hptsslot; 856 diag->p_on_min_sleep = hpts->p_on_min_sleep; 857 } 858 if (logging_on) { 859 tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); 860 } 861 /* Now do we need to restart the hpts's timer? */ 862 if (TSTMP_GT(ticknow, hpts->p_curtick)) 863 have_slept = ticknow - hpts->p_curtick; 864 else 865 have_slept = 0; 866 if (have_slept < hpts->p_hpts_sleep_time) { 867 /* This should be what happens */ 868 yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; 869 } else { 870 /* We are over-due */ 871 yet_to_sleep = 0; 872 need_wakeup = 1; 873 } 874 if (diag) { 875 diag->have_slept = have_slept; 876 diag->yet_to_sleep = yet_to_sleep; 877 diag->hpts_sleep_time = hpts->p_hpts_sleep_time; 878 } 879 if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { 880 /* 881 * We need to reschedule the hptss time-out. 882 */ 883 hpts->p_hpts_sleep_time = slot; 884 need_new_to = slot * HPTS_TICKS_PER_USEC; 885 } 886 } 887 hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); 888 if (logging_on) { 889 tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); 890 } 891 /* 892 * Now how far is the hpts sleeping to? if active is 1, its 893 * up and ticking we do nothing, otherwise we may need to 894 * reschedule its callout if need_new_to is set from above. 
895 */ 896 if (need_wakeup) { 897 if (logging_on) { 898 tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); 899 } 900 hpts->p_direct_wake = 1; 901 tcp_wakehpts(hpts); 902 if (diag) { 903 diag->need_new_to = 0; 904 diag->co_ret = 0xffff0000; 905 } 906 } else if (need_new_to) { 907 int32_t co_ret; 908 struct timeval tv; 909 sbintime_t sb; 910 911 tv.tv_sec = 0; 912 tv.tv_usec = 0; 913 while (need_new_to > HPTS_USEC_IN_SEC) { 914 tv.tv_sec++; 915 need_new_to -= HPTS_USEC_IN_SEC; 916 } 917 tv.tv_usec = need_new_to; 918 sb = tvtosbt(tv); 919 if (tcp_hpts_callout_skip_swi == 0) { 920 co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, 921 hpts_timeout_swi, hpts, hpts->p_cpu, 922 (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); 923 } else { 924 co_ret = callout_reset_sbt_on(&hpts->co, sb, 0, 925 hpts_timeout_dir, hpts, 926 hpts->p_cpu, 927 C_PREL(tcp_hpts_precision)); 928 } 929 if (diag) { 930 diag->need_new_to = need_new_to; 931 diag->co_ret = co_ret; 932 } 933 } 934 } else { 935 #ifdef INVARIANTS 936 panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp); 937 #endif 938 } 939 } 940 941 uint32_t 942 tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){ 943 struct tcp_hpts_entry *hpts; 944 uint32_t slot_on, cts; 945 struct timeval tv; 946 947 /* 948 * We now return the next-slot the hpts will be on, beyond its 949 * current run (if up) or where it was when it stopped if it is 950 * sleeping. 
951 */ 952 INP_WLOCK_ASSERT(inp); 953 hpts = tcp_hpts_lock(inp); 954 if (in_ts_percision) 955 microuptime(&tv); 956 else 957 getmicrouptime(&tv); 958 cts = tcp_tv_to_usectick(&tv); 959 tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0); 960 slot_on = hpts->p_nxt_slot; 961 mtx_unlock(&hpts->p_mtx); 962 return (slot_on); 963 } 964 965 uint32_t 966 __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ 967 return (tcp_hpts_insert_diag(inp, slot, line, NULL)); 968 } 969 970 int 971 __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) 972 { 973 int32_t retval = 0; 974 975 HPTS_MTX_ASSERT(hpts); 976 if (inp->inp_in_input == 0) { 977 /* Ok we need to set it on the hpts in the current slot */ 978 hpts_sane_input_insert(hpts, inp, line); 979 retval = 1; 980 if (hpts->p_hpts_active == 0) { 981 /* 982 * Activate the hpts if it is sleeping. 983 */ 984 if (logging_on) { 985 tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0); 986 } 987 retval = 2; 988 hpts->p_direct_wake = 1; 989 tcp_wakeinput(hpts); 990 } 991 } else if (hpts->p_hpts_active == 0) { 992 retval = 4; 993 hpts->p_direct_wake = 1; 994 tcp_wakeinput(hpts); 995 } 996 return (retval); 997 } 998 999 void 1000 tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 1001 int32_t tlen, int32_t drop_hdrlen, uint8_t iptos) 1002 { 1003 /* Setup packet for input first */ 1004 INP_WLOCK_ASSERT(tp->t_inpcb); 1005 m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t)); 1006 m->m_pkthdr.pace_tlen = (uint16_t) tlen; 1007 m->m_pkthdr.pace_drphdrlen = drop_hdrlen; 1008 m->m_pkthdr.pace_tos = iptos; 1009 m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0); 1010 if (tp->t_in_pkt == NULL) { 1011 tp->t_in_pkt = m; 1012 tp->t_tail_pkt = m; 1013 } else { 1014 tp->t_tail_pkt->m_nextpkt = m; 1015 tp->t_tail_pkt = m; 1016 } 1017 } 1018 1019 1020 int32_t 1021 __tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, 1022 int32_t 
tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){ 1023 struct tcp_hpts_entry *hpts; 1024 int32_t ret; 1025 1026 tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos); 1027 hpts = tcp_input_lock(tp->t_inpcb); 1028 ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line); 1029 mtx_unlock(&hpts->p_mtx); 1030 return (ret); 1031 } 1032 1033 void 1034 __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line) 1035 { 1036 struct tcp_hpts_entry *hpts; 1037 struct tcpcb *tp; 1038 1039 tp = intotcpcb(inp); 1040 hpts = tcp_input_lock(tp->t_inpcb); 1041 if (inp->inp_in_input == 0) { 1042 /* Ok we need to set it on the hpts in the current slot */ 1043 hpts_sane_input_insert(hpts, inp, line); 1044 if (hpts->p_hpts_active == 0) { 1045 /* 1046 * Activate the hpts if it is sleeping. 1047 */ 1048 hpts->p_direct_wake = 1; 1049 tcp_wakeinput(hpts); 1050 } 1051 } else if (hpts->p_hpts_active == 0) { 1052 hpts->p_direct_wake = 1; 1053 tcp_wakeinput(hpts); 1054 } 1055 inp->inp_hpts_drop_reas = reason; 1056 mtx_unlock(&hpts->p_mtx); 1057 } 1058 1059 static uint16_t 1060 hpts_random_cpu(struct inpcb *inp){ 1061 /* 1062 * No flow type set distribute the load randomly. 1063 */ 1064 uint16_t cpuid; 1065 uint32_t ran; 1066 1067 /* 1068 * If one has been set use it i.e. we want both in and out on the 1069 * same hpts. 1070 */ 1071 if (inp->inp_input_cpu_set) { 1072 return (inp->inp_input_cpu); 1073 } else if (inp->inp_hpts_cpu_set) { 1074 return (inp->inp_hpts_cpu); 1075 } 1076 /* Nothing set use a random number */ 1077 ran = arc4random(); 1078 cpuid = (ran & 0xffff) % mp_ncpus; 1079 return (cpuid); 1080 } 1081 1082 static uint16_t 1083 hpts_cpuid(struct inpcb *inp){ 1084 uint16_t cpuid; 1085 1086 1087 /* 1088 * If one has been set use it i.e. we want both in and out on the 1089 * same hpts. 
1090 */ 1091 if (inp->inp_input_cpu_set) { 1092 return (inp->inp_input_cpu); 1093 } else if (inp->inp_hpts_cpu_set) { 1094 return (inp->inp_hpts_cpu); 1095 } 1096 /* If one is set the other must be the same */ 1097 #ifdef RSS 1098 cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype); 1099 if (cpuid == NETISR_CPUID_NONE) 1100 return (hpts_random_cpu(inp)); 1101 else 1102 return (cpuid); 1103 #else 1104 /* 1105 * We don't have a flowid -> cpuid mapping, so cheat and just map 1106 * unknown cpuids to curcpu. Not the best, but apparently better 1107 * than defaulting to swi 0. 1108 */ 1109 if (inp->inp_flowtype != M_HASHTYPE_NONE) { 1110 cpuid = inp->inp_flowid % mp_ncpus; 1111 return (cpuid); 1112 } 1113 cpuid = hpts_random_cpu(inp); 1114 return (cpuid); 1115 #endif 1116 } 1117 1118 /* 1119 * Do NOT try to optimize the processing of inp's 1120 * by first pulling off all the inp's into a temporary 1121 * list (e.g. TAILQ_CONCAT). If you do that the subtle 1122 * interactions of switching CPU's will kill because of 1123 * problems in the linked list manipulation. Basically 1124 * you would switch cpu's with the hpts mutex locked 1125 * but then while you were processing one of the inp's 1126 * some other one that you switch will get a new 1127 * packet on the different CPU. It will insert it 1128 * on the new hptss input list. Creating a temporary 1129 * link in the inp will not fix it either, since 1130 * the other hpts will be doing the same thing and 1131 * you will both end up using the temporary link. 1132 * 1133 * You will die in an ASSERT for tailq corruption if you 1134 * run INVARIANTS or you will die horribly without 1135 * INVARIANTS in some unknown way with a corrupt linked 1136 * list. 
1137 */ 1138 static void 1139 tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) 1140 { 1141 struct mbuf *m, *n; 1142 struct tcpcb *tp; 1143 struct inpcb *inp; 1144 uint16_t drop_reason; 1145 int16_t set_cpu; 1146 uint32_t did_prefetch = 0; 1147 int32_t ti_locked = TI_UNLOCKED; 1148 struct epoch_tracker et; 1149 1150 HPTS_MTX_ASSERT(hpts); 1151 while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { 1152 HPTS_MTX_ASSERT(hpts); 1153 hpts_sane_input_remove(hpts, inp, 0); 1154 if (inp->inp_input_cpu_set == 0) { 1155 set_cpu = 1; 1156 } else { 1157 set_cpu = 0; 1158 } 1159 hpts->p_inp = inp; 1160 drop_reason = inp->inp_hpts_drop_reas; 1161 inp->inp_in_input = 0; 1162 mtx_unlock(&hpts->p_mtx); 1163 CURVNET_SET(inp->inp_vnet); 1164 if (drop_reason) { 1165 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1166 ti_locked = TI_RLOCKED; 1167 } else { 1168 ti_locked = TI_UNLOCKED; 1169 } 1170 INP_WLOCK(inp); 1171 if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || 1172 (inp->inp_flags2 & INP_FREED)) { 1173 out: 1174 hpts->p_inp = NULL; 1175 if (ti_locked == TI_RLOCKED) { 1176 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1177 } 1178 if (in_pcbrele_wlocked(inp) == 0) { 1179 INP_WUNLOCK(inp); 1180 } 1181 ti_locked = TI_UNLOCKED; 1182 CURVNET_RESTORE(); 1183 mtx_lock(&hpts->p_mtx); 1184 continue; 1185 } 1186 tp = intotcpcb(inp); 1187 if ((tp == NULL) || (tp->t_inpcb == NULL)) { 1188 goto out; 1189 } 1190 if (drop_reason) { 1191 /* This tcb is being destroyed for drop_reason */ 1192 m = tp->t_in_pkt; 1193 if (m) 1194 n = m->m_nextpkt; 1195 else 1196 n = NULL; 1197 tp->t_in_pkt = NULL; 1198 while (m) { 1199 m_freem(m); 1200 m = n; 1201 if (m) 1202 n = m->m_nextpkt; 1203 } 1204 tp = tcp_drop(tp, drop_reason); 1205 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1206 if (tp == NULL) { 1207 INP_WLOCK(inp); 1208 } 1209 if (in_pcbrele_wlocked(inp) == 0) 1210 INP_WUNLOCK(inp); 1211 CURVNET_RESTORE(); 1212 mtx_lock(&hpts->p_mtx); 1213 continue; 1214 } 1215 if (set_cpu) { 1216 /* 1217 * Setup so the next time 
we will move to the right 1218 * CPU. This should be a rare event. It will 1219 * sometimes happens when we are the client side 1220 * (usually not the server). Somehow tcp_output() 1221 * gets called before the tcp_do_segment() sets the 1222 * intial state. This means the r_cpu and r_hpts_cpu 1223 * is 0. We get on the hpts, and then tcp_input() 1224 * gets called setting up the r_cpu to the correct 1225 * value. The hpts goes off and sees the mis-match. 1226 * We simply correct it here and the CPU will switch 1227 * to the new hpts nextime the tcb gets added to the 1228 * the hpts (not this time) :-) 1229 */ 1230 tcp_set_hpts(inp); 1231 } 1232 m = tp->t_in_pkt; 1233 n = NULL; 1234 if (m != NULL && 1235 (m->m_pkthdr.pace_lock == TI_RLOCKED || 1236 tp->t_state != TCPS_ESTABLISHED)) { 1237 ti_locked = TI_RLOCKED; 1238 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1239 m = tp->t_in_pkt; 1240 } 1241 if (in_newts_every_tcb) { 1242 if (in_ts_percision) 1243 microuptime(tv); 1244 else 1245 getmicrouptime(tv); 1246 } 1247 if (tp->t_fb_ptr != NULL) { 1248 kern_prefetch(tp->t_fb_ptr, &did_prefetch); 1249 did_prefetch = 1; 1250 } 1251 /* Any input work to do, if so do it first */ 1252 if ((m != NULL) && (m == tp->t_in_pkt)) { 1253 struct tcphdr *th; 1254 int32_t tlen, drop_hdrlen, nxt_pkt; 1255 uint8_t iptos; 1256 1257 n = m->m_nextpkt; 1258 tp->t_in_pkt = tp->t_tail_pkt = NULL; 1259 while (m) { 1260 th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff); 1261 tlen = m->m_pkthdr.pace_tlen; 1262 drop_hdrlen = m->m_pkthdr.pace_drphdrlen; 1263 iptos = m->m_pkthdr.pace_tos; 1264 m->m_nextpkt = NULL; 1265 if (n) 1266 nxt_pkt = 1; 1267 else 1268 nxt_pkt = 0; 1269 inp->inp_input_calls = 1; 1270 if (tp->t_fb->tfb_tcp_hpts_do_segment) { 1271 /* Use the hpts specific do_segment */ 1272 (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket, 1273 tp, drop_hdrlen, 1274 tlen, iptos, nxt_pkt, tv); 1275 } else { 1276 /* Use the default do_segment */ 1277 (*tp->t_fb->tfb_tcp_do_segment) 
(m, th, inp->inp_socket, 1278 tp, drop_hdrlen, 1279 tlen, iptos); 1280 } 1281 if (ti_locked == TI_RLOCKED) 1282 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1283 /* 1284 * Do segment returns unlocked we need the 1285 * lock again but we also need some kasserts 1286 * here. 1287 */ 1288 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1289 INP_UNLOCK_ASSERT(inp); 1290 m = n; 1291 if (m) 1292 n = m->m_nextpkt; 1293 if (m != NULL && 1294 m->m_pkthdr.pace_lock == TI_RLOCKED) { 1295 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1296 ti_locked = TI_RLOCKED; 1297 } else 1298 ti_locked = TI_UNLOCKED; 1299 INP_WLOCK(inp); 1300 /* 1301 * Since we have an opening here we must 1302 * re-check if the tcb went away while we 1303 * were getting the lock(s). 1304 */ 1305 if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || 1306 (inp->inp_flags2 & INP_FREED)) { 1307 while (m) { 1308 m_freem(m); 1309 m = n; 1310 if (m) 1311 n = m->m_nextpkt; 1312 } 1313 goto out; 1314 } 1315 /* 1316 * Now that we hold the INP lock, check if 1317 * we need to upgrade our lock. 
1318 */ 1319 if (ti_locked == TI_UNLOCKED && 1320 (tp->t_state != TCPS_ESTABLISHED)) { 1321 ti_locked = TI_RLOCKED; 1322 INP_INFO_RLOCK_ET(&V_tcbinfo, et); 1323 } 1324 } /** end while(m) */ 1325 } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ 1326 if (in_pcbrele_wlocked(inp) == 0) 1327 INP_WUNLOCK(inp); 1328 if (ti_locked == TI_RLOCKED) 1329 INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); 1330 INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); 1331 INP_UNLOCK_ASSERT(inp); 1332 ti_locked = TI_UNLOCKED; 1333 mtx_lock(&hpts->p_mtx); 1334 hpts->p_inp = NULL; 1335 CURVNET_RESTORE(); 1336 } 1337 } 1338 1339 static int 1340 tcp_hpts_est_run(struct tcp_hpts_entry *hpts) 1341 { 1342 int32_t ticks_to_run; 1343 1344 if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) { 1345 ticks_to_run = hpts->p_curtick - hpts->p_prevtick; 1346 if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) { 1347 ticks_to_run = NUM_OF_HPTSI_SLOTS - 2; 1348 } 1349 } else { 1350 if (hpts->p_prevtick == hpts->p_curtick) { 1351 /* This happens when we get woken up right away */ 1352 return (-1); 1353 } 1354 ticks_to_run = 1; 1355 } 1356 /* Set in where we will be when we catch up */ 1357 hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS; 1358 if (hpts->p_nxt_slot == hpts->p_cur_slot) { 1359 panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d", 1360 hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run); 1361 } 1362 return (ticks_to_run); 1363 } 1364 1365 static void 1366 tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) 1367 { 1368 struct tcpcb *tp; 1369 struct inpcb *inp = NULL, *ninp; 1370 struct timeval tv; 1371 int32_t ticks_to_run, i, error, tick_now, interum_tick; 1372 int32_t paced_cnt = 0; 1373 int32_t did_prefetch = 0; 1374 int32_t prefetch_ninp = 0; 1375 int32_t prefetch_tp = 0; 1376 uint32_t cts; 1377 int16_t set_cpu; 1378 1379 HPTS_MTX_ASSERT(hpts); 1380 hpts->p_curtick = tcp_tv_to_hptstick(ctick); 1381 cts = tcp_tv_to_usectick(ctick); 1382 
memcpy(&tv, ctick, sizeof(struct timeval)); 1383 hpts->p_cur_slot = hpts_tick(hpts, 1); 1384 1385 /* Figure out if we had missed ticks */ 1386 again: 1387 HPTS_MTX_ASSERT(hpts); 1388 ticks_to_run = tcp_hpts_est_run(hpts); 1389 if (!TAILQ_EMPTY(&hpts->p_input)) { 1390 tcp_input_data(hpts, &tv); 1391 } 1392 #ifdef INVARIANTS 1393 if (TAILQ_EMPTY(&hpts->p_input) && 1394 (hpts->p_on_inqueue_cnt != 0)) { 1395 panic("tp:%p in_hpts input empty but cnt:%d", 1396 hpts, hpts->p_on_inqueue_cnt); 1397 } 1398 #endif 1399 HPTS_MTX_ASSERT(hpts); 1400 /* Reset the ticks to run and time if we need too */ 1401 interum_tick = tcp_gethptstick(&tv); 1402 if (interum_tick != hpts->p_curtick) { 1403 /* Save off the new time we execute to */ 1404 *ctick = tv; 1405 hpts->p_curtick = interum_tick; 1406 cts = tcp_tv_to_usectick(&tv); 1407 hpts->p_cur_slot = hpts_tick(hpts, 1); 1408 ticks_to_run = tcp_hpts_est_run(hpts); 1409 } 1410 if (ticks_to_run == -1) { 1411 goto no_run; 1412 } 1413 if (logging_on) { 1414 tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0); 1415 } 1416 if (hpts->p_on_queue_cnt == 0) { 1417 goto no_one; 1418 } 1419 HPTS_MTX_ASSERT(hpts); 1420 for (i = 0; i < ticks_to_run; i++) { 1421 /* 1422 * Calculate our delay, if there are no extra ticks there 1423 * was not any 1424 */ 1425 hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; 1426 HPTS_MTX_ASSERT(hpts); 1427 while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { 1428 /* For debugging */ 1429 if (logging_on) { 1430 tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i); 1431 } 1432 hpts->p_inp = inp; 1433 paced_cnt++; 1434 if (hpts->p_cur_slot != inp->inp_hptsslot) { 1435 panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", 1436 hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot); 1437 } 1438 /* Now pull it */ 1439 if (inp->inp_hpts_cpu_set == 0) { 1440 set_cpu = 1; 1441 } else { 1442 set_cpu = 0; 1443 } 1444 hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0); 
1445 if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { 1446 /* We prefetch the next inp if possible */ 1447 kern_prefetch(ninp, &prefetch_ninp); 1448 prefetch_ninp = 1; 1449 } 1450 if (inp->inp_hpts_request) { 1451 /* 1452 * This guy is deferred out further in time 1453 * then our wheel had on it. Push him back 1454 * on the wheel. 1455 */ 1456 int32_t remaining_slots; 1457 1458 remaining_slots = ticks_to_run - (i + 1); 1459 if (inp->inp_hpts_request > remaining_slots) { 1460 /* 1461 * Keep INVARIANTS happy by clearing 1462 * the flag 1463 */ 1464 tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1); 1465 hpts->p_inp = NULL; 1466 continue; 1467 } 1468 inp->inp_hpts_request = 0; 1469 } 1470 /* 1471 * We clear the hpts flag here after dealing with 1472 * remaining slots. This way anyone looking with the 1473 * TCB lock will see its on the hpts until just 1474 * before we unlock. 1475 */ 1476 inp->inp_in_hpts = 0; 1477 mtx_unlock(&hpts->p_mtx); 1478 INP_WLOCK(inp); 1479 if (in_pcbrele_wlocked(inp)) { 1480 mtx_lock(&hpts->p_mtx); 1481 if (logging_on) 1482 tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1); 1483 hpts->p_inp = NULL; 1484 continue; 1485 } 1486 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 1487 out_now: 1488 #ifdef INVARIANTS 1489 if (mtx_owned(&hpts->p_mtx)) { 1490 panic("Hpts:%p owns mtx prior-to lock line:%d", 1491 hpts, __LINE__); 1492 } 1493 #endif 1494 INP_WUNLOCK(inp); 1495 mtx_lock(&hpts->p_mtx); 1496 if (logging_on) 1497 tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3); 1498 hpts->p_inp = NULL; 1499 continue; 1500 } 1501 tp = intotcpcb(inp); 1502 if ((tp == NULL) || (tp->t_inpcb == NULL)) { 1503 goto out_now; 1504 } 1505 if (set_cpu) { 1506 /* 1507 * Setup so the next time we will move to 1508 * the right CPU. This should be a rare 1509 * event. It will sometimes happens when we 1510 * are the client side (usually not the 1511 * server). 
Somehow tcp_output() gets called 1512 * before the tcp_do_segment() sets the 1513 * intial state. This means the r_cpu and 1514 * r_hpts_cpu is 0. We get on the hpts, and 1515 * then tcp_input() gets called setting up 1516 * the r_cpu to the correct value. The hpts 1517 * goes off and sees the mis-match. We 1518 * simply correct it here and the CPU will 1519 * switch to the new hpts nextime the tcb 1520 * gets added to the the hpts (not this one) 1521 * :-) 1522 */ 1523 tcp_set_hpts(inp); 1524 } 1525 if (out_newts_every_tcb) { 1526 struct timeval sv; 1527 1528 if (out_ts_percision) 1529 microuptime(&sv); 1530 else 1531 getmicrouptime(&sv); 1532 cts = tcp_tv_to_usectick(&sv); 1533 } 1534 CURVNET_SET(inp->inp_vnet); 1535 /* 1536 * There is a hole here, we get the refcnt on the 1537 * inp so it will still be preserved but to make 1538 * sure we can get the INP we need to hold the p_mtx 1539 * above while we pull out the tp/inp, as long as 1540 * fini gets the lock first we are assured of having 1541 * a sane INP we can lock and test. 1542 */ 1543 #ifdef INVARIANTS 1544 if (mtx_owned(&hpts->p_mtx)) { 1545 panic("Hpts:%p owns mtx before tcp-output:%d", 1546 hpts, __LINE__); 1547 } 1548 #endif 1549 if (tp->t_fb_ptr != NULL) { 1550 kern_prefetch(tp->t_fb_ptr, &did_prefetch); 1551 did_prefetch = 1; 1552 } 1553 inp->inp_hpts_calls = 1; 1554 if (tp->t_fb->tfb_tcp_output_wtime != NULL) { 1555 error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv); 1556 } else { 1557 error = tp->t_fb->tfb_tcp_output(tp); 1558 } 1559 if (ninp && ninp->inp_ppcb) { 1560 /* 1561 * If we have a nxt inp, see if we can 1562 * prefetch its ppcb. Note this may seem 1563 * "risky" since we have no locks (other 1564 * than the previous inp) and there no 1565 * assurance that ninp was not pulled while 1566 * we were processing inp and freed. 
If this 1567 * occured it could mean that either: 1568 * 1569 * a) Its NULL (which is fine we won't go 1570 * here) <or> b) Its valid (which is cool we 1571 * will prefetch it) <or> c) The inp got 1572 * freed back to the slab which was 1573 * reallocated. Then the piece of memory was 1574 * re-used and something else (not an 1575 * address) is in inp_ppcb. If that occurs 1576 * we don't crash, but take a TLB shootdown 1577 * performance hit (same as if it was NULL 1578 * and we tried to pre-fetch it). 1579 * 1580 * Considering that the likelyhood of <c> is 1581 * quite rare we will take a risk on doing 1582 * this. If performance drops after testing 1583 * we can always take this out. NB: the 1584 * kern_prefetch on amd64 actually has 1585 * protection against a bad address now via 1586 * the DMAP_() tests. This will prevent the 1587 * TLB hit, and instead if <c> occurs just 1588 * cause us to load cache with a useless 1589 * address (to us). 1590 */ 1591 kern_prefetch(ninp->inp_ppcb, &prefetch_tp); 1592 prefetch_tp = 1; 1593 } 1594 INP_WUNLOCK(inp); 1595 INP_UNLOCK_ASSERT(inp); 1596 CURVNET_RESTORE(); 1597 #ifdef INVARIANTS 1598 if (mtx_owned(&hpts->p_mtx)) { 1599 panic("Hpts:%p owns mtx prior-to lock line:%d", 1600 hpts, __LINE__); 1601 } 1602 #endif 1603 mtx_lock(&hpts->p_mtx); 1604 if (logging_on) 1605 tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4); 1606 hpts->p_inp = NULL; 1607 } 1608 HPTS_MTX_ASSERT(hpts); 1609 hpts->p_inp = NULL; 1610 hpts->p_cur_slot++; 1611 if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) { 1612 hpts->p_cur_slot = 0; 1613 } 1614 } 1615 no_one: 1616 HPTS_MTX_ASSERT(hpts); 1617 hpts->p_prevtick = hpts->p_curtick; 1618 hpts->p_delayed_by = 0; 1619 /* 1620 * Check to see if we took an excess amount of time and need to run 1621 * more ticks (if we did not hit eno-bufs). 
1622 */ 1623 /* Re-run any input that may be there */ 1624 (void)tcp_gethptstick(&tv); 1625 if (!TAILQ_EMPTY(&hpts->p_input)) { 1626 tcp_input_data(hpts, &tv); 1627 } 1628 #ifdef INVARIANTS 1629 if (TAILQ_EMPTY(&hpts->p_input) && 1630 (hpts->p_on_inqueue_cnt != 0)) { 1631 panic("tp:%p in_hpts input empty but cnt:%d", 1632 hpts, hpts->p_on_inqueue_cnt); 1633 } 1634 #endif 1635 tick_now = tcp_gethptstick(&tv); 1636 if (SEQ_GT(tick_now, hpts->p_prevtick)) { 1637 struct timeval res; 1638 1639 /* Did we really spend a full tick or more in here? */ 1640 timersub(&tv, ctick, &res); 1641 if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) { 1642 counter_u64_add(hpts_loops, 1); 1643 if (logging_on) { 1644 tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now); 1645 } 1646 *ctick = res; 1647 hpts->p_curtick = tick_now; 1648 goto again; 1649 } 1650 } 1651 no_run: 1652 { 1653 uint32_t t = 0, i, fnd = 0; 1654 1655 if (hpts->p_on_queue_cnt) { 1656 1657 1658 /* 1659 * Find next slot that is occupied and use that to 1660 * be the sleep time. 
1661 */ 1662 for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { 1663 if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { 1664 fnd = 1; 1665 break; 1666 } 1667 t = (t + 1) % NUM_OF_HPTSI_SLOTS; 1668 } 1669 if (fnd) { 1670 hpts->p_hpts_sleep_time = i; 1671 } else { 1672 counter_u64_add(back_tosleep, 1); 1673 #ifdef INVARIANTS 1674 panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt); 1675 #endif 1676 hpts->p_on_queue_cnt = 0; 1677 goto non_found; 1678 } 1679 t++; 1680 } else { 1681 /* No one on the wheel sleep for all but 2 slots */ 1682 non_found: 1683 if (hpts_sleep_max == 0) 1684 hpts_sleep_max = 1; 1685 hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max); 1686 t = 0; 1687 } 1688 if (logging_on) { 1689 tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC)); 1690 } 1691 } 1692 } 1693 1694 void 1695 __tcp_set_hpts(struct inpcb *inp, int32_t line) 1696 { 1697 struct tcp_hpts_entry *hpts; 1698 1699 INP_WLOCK_ASSERT(inp); 1700 hpts = tcp_hpts_lock(inp); 1701 if ((inp->inp_in_hpts == 0) && 1702 (inp->inp_hpts_cpu_set == 0)) { 1703 inp->inp_hpts_cpu = hpts_cpuid(inp); 1704 inp->inp_hpts_cpu_set = 1; 1705 } 1706 mtx_unlock(&hpts->p_mtx); 1707 hpts = tcp_input_lock(inp); 1708 if ((inp->inp_input_cpu_set == 0) && 1709 (inp->inp_in_input == 0)) { 1710 inp->inp_input_cpu = hpts_cpuid(inp); 1711 inp->inp_input_cpu_set = 1; 1712 } 1713 mtx_unlock(&hpts->p_mtx); 1714 } 1715 1716 uint16_t 1717 tcp_hpts_delayedby(struct inpcb *inp){ 1718 return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by); 1719 } 1720 1721 static void 1722 tcp_hpts_thread(void *ctx) 1723 { 1724 struct tcp_hpts_entry *hpts; 1725 struct timeval tv; 1726 sbintime_t sb; 1727 1728 hpts = (struct tcp_hpts_entry *)ctx; 1729 mtx_lock(&hpts->p_mtx); 1730 if (hpts->p_direct_wake) { 1731 /* Signaled by input */ 1732 if (logging_on) 1733 tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1); 1734 callout_stop(&hpts->co); 1735 } else { 1736 /* 
Timed out */ 1737 if (callout_pending(&hpts->co) || 1738 !callout_active(&hpts->co)) { 1739 if (logging_on) 1740 tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2); 1741 mtx_unlock(&hpts->p_mtx); 1742 return; 1743 } 1744 callout_deactivate(&hpts->co); 1745 if (logging_on) 1746 tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3); 1747 } 1748 hpts->p_hpts_active = 1; 1749 (void)tcp_gethptstick(&tv); 1750 tcp_hptsi(hpts, &tv); 1751 HPTS_MTX_ASSERT(hpts); 1752 tv.tv_sec = 0; 1753 tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; 1754 if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) { 1755 tv.tv_usec = tcp_min_hptsi_time; 1756 hpts->p_on_min_sleep = 1; 1757 } else { 1758 /* Clear the min sleep flag */ 1759 hpts->p_on_min_sleep = 0; 1760 } 1761 hpts->p_hpts_active = 0; 1762 sb = tvtosbt(tv); 1763 if (tcp_hpts_callout_skip_swi == 0) { 1764 callout_reset_sbt_on(&hpts->co, sb, 0, 1765 hpts_timeout_swi, hpts, hpts->p_cpu, 1766 (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); 1767 } else { 1768 callout_reset_sbt_on(&hpts->co, sb, 0, 1769 hpts_timeout_dir, hpts, 1770 hpts->p_cpu, 1771 C_PREL(tcp_hpts_precision)); 1772 } 1773 hpts->p_direct_wake = 0; 1774 mtx_unlock(&hpts->p_mtx); 1775 } 1776 1777 #undef timersub 1778 1779 static void 1780 tcp_init_hptsi(void *st) 1781 { 1782 int32_t i, j, error, bound = 0, created = 0; 1783 size_t sz, asz; 1784 struct timeval tv; 1785 sbintime_t sb; 1786 struct tcp_hpts_entry *hpts; 1787 char unit[16]; 1788 uint32_t ncpus = mp_ncpus ? 
mp_ncpus : MAXCPU; 1789 1790 tcp_pace.rp_proc = NULL; 1791 tcp_pace.rp_num_hptss = ncpus; 1792 hpts_loops = counter_u64_alloc(M_WAITOK); 1793 back_tosleep = counter_u64_alloc(M_WAITOK); 1794 1795 sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); 1796 tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); 1797 asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; 1798 for (i = 0; i < tcp_pace.rp_num_hptss; i++) { 1799 tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry), 1800 M_TCPHPTS, M_WAITOK | M_ZERO); 1801 tcp_pace.rp_ent[i]->p_hptss = malloc(asz, 1802 M_TCPHPTS, M_WAITOK); 1803 hpts = tcp_pace.rp_ent[i]; 1804 /* 1805 * Init all the hpts structures that are not specifically 1806 * zero'd by the allocations. Also lets attach them to the 1807 * appropriate sysctl block as well. 1808 */ 1809 mtx_init(&hpts->p_mtx, "tcp_hpts_lck", 1810 "hpts", MTX_DEF | MTX_DUPOK); 1811 TAILQ_INIT(&hpts->p_input); 1812 for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) { 1813 TAILQ_INIT(&hpts->p_hptss[j]); 1814 } 1815 sysctl_ctx_init(&hpts->hpts_ctx); 1816 sprintf(unit, "%d", i); 1817 hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx, 1818 SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts), 1819 OID_AUTO, 1820 unit, 1821 CTLFLAG_RW, 0, 1822 ""); 1823 SYSCTL_ADD_INT(&hpts->hpts_ctx, 1824 SYSCTL_CHILDREN(hpts->hpts_root), 1825 OID_AUTO, "in_qcnt", CTLFLAG_RD, 1826 &hpts->p_on_inqueue_cnt, 0, 1827 "Count TCB's awaiting input processing"); 1828 SYSCTL_ADD_INT(&hpts->hpts_ctx, 1829 SYSCTL_CHILDREN(hpts->hpts_root), 1830 OID_AUTO, "out_qcnt", CTLFLAG_RD, 1831 &hpts->p_on_queue_cnt, 0, 1832 "Count TCB's awaiting output processing"); 1833 SYSCTL_ADD_UINT(&hpts->hpts_ctx, 1834 SYSCTL_CHILDREN(hpts->hpts_root), 1835 OID_AUTO, "active", CTLFLAG_RD, 1836 &hpts->p_hpts_active, 0, 1837 "Is the hpts active"); 1838 SYSCTL_ADD_UINT(&hpts->hpts_ctx, 1839 SYSCTL_CHILDREN(hpts->hpts_root), 1840 OID_AUTO, "curslot", CTLFLAG_RD, 1841 &hpts->p_cur_slot, 0, 1842 "What the current slot is if active"); 
1843 SYSCTL_ADD_UINT(&hpts->hpts_ctx, 1844 SYSCTL_CHILDREN(hpts->hpts_root), 1845 OID_AUTO, "curtick", CTLFLAG_RD, 1846 &hpts->p_curtick, 0, 1847 "What the current tick on if active"); 1848 SYSCTL_ADD_UINT(&hpts->hpts_ctx, 1849 SYSCTL_CHILDREN(hpts->hpts_root), 1850 OID_AUTO, "logsize", CTLFLAG_RD, 1851 &hpts->p_logsize, 0, 1852 "Hpts logging buffer size"); 1853 hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2; 1854 hpts->p_num = i; 1855 hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv); 1856 hpts->p_prevtick -= 1; 1857 hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS; 1858 hpts->p_cpu = 0xffff; 1859 hpts->p_nxt_slot = 1; 1860 hpts->p_logsize = tcp_hpts_logging_size; 1861 if (hpts->p_logsize) { 1862 sz = (sizeof(struct hpts_log) * hpts->p_logsize); 1863 hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); 1864 } 1865 callout_init(&hpts->co, 1); 1866 } 1867 /* 1868 * Now lets start ithreads to handle the hptss. 1869 */ 1870 CPU_FOREACH(i) { 1871 hpts = tcp_pace.rp_ent[i]; 1872 hpts->p_cpu = i; 1873 error = swi_add(&hpts->ie, "hpts", 1874 tcp_hpts_thread, (void *)hpts, 1875 SWI_NET, INTR_MPSAFE, &hpts->ie_cookie); 1876 if (error) { 1877 panic("Can't add hpts:%p i:%d err:%d", 1878 hpts, i, error); 1879 } 1880 created++; 1881 if (tcp_bind_threads) { 1882 if (intr_event_bind(hpts->ie, i) == 0) 1883 bound++; 1884 } 1885 tv.tv_sec = 0; 1886 tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; 1887 sb = tvtosbt(tv); 1888 if (tcp_hpts_callout_skip_swi == 0) { 1889 callout_reset_sbt_on(&hpts->co, sb, 0, 1890 hpts_timeout_swi, hpts, hpts->p_cpu, 1891 (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision))); 1892 } else { 1893 callout_reset_sbt_on(&hpts->co, sb, 0, 1894 hpts_timeout_dir, hpts, 1895 hpts->p_cpu, 1896 C_PREL(tcp_hpts_precision)); 1897 } 1898 } 1899 printf("TCP Hpts created %d swi interrupt thread and bound %d\n", 1900 created, bound); 1901 return; 1902 } 1903 1904 SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL); 1905 
MODULE_VERSION(tcphpts, 1); 1906