/*-
 * Copyright (c) 2016-2018 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_tcpdebug.h"
/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp.  Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire.  It can be used in two ways
 * by a given TCP stack (and those two methods can be used
 * simultaneously).
 *
 * First, and probably the main thing it is used for by Rack and BBR,
 * it can be used to call tcp_output() of a transport stack at some
 * time in the future.  The normal way this is done is that
 * tcp_output() of the stack schedules itself to be called again by
 * calling tcp_hpts_insert(tcpcb, slot).  The slot is the time from
 * now that the stack wants to be called, and it must be converted
 * to tcp_hpts's notion of a slot.  This is done with one of the
 * macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS.  So a typical
 * call from the tcp_output() routine might look like:
 *
 *	tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550
 * microseconds.  Note that if using this mechanism the stack will
 * want to add, near its top, a check to prevent unwanted calls
 * (from user land or the arrival of incoming acks), so it would
 * add something like:
 *
 *	if (inp->inp_in_hpts)
 *		return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare-bones example and the stack will probably
 * have more considerations than just the above.
 *
 * Now the tcp_hpts system will call tcp_output in one of two forms.
 * It will first check to see if the stack has defined a
 * tfb_tcp_output_wtime() function; if so, that is the routine it
 * will call.  If that function is not defined then it will call the
 * tfb_tcp_output() function.  The only difference between these
 * two calls is that the former passes the time in to the function
 * so the function does not have to access the time (which tcp_hpts
 * already has).  What these functions do is of course totally up
 * to the individual tcp stack.
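 *
 * Pulling those two pieces together, a paced output path might be
 * shaped roughly like the sketch below (hedged: example_output() and
 * the fixed 550 usec gap are invented for illustration; they are not
 * code from Rack, BBR, or any real stack):
 *
 *	static int
 *	example_output(struct tcpcb *tp)
 *	{
 *		if (tp->t_inpcb->inp_in_hpts)
 *			return (0);	(still paced; hpts calls us back)
 *		... transmit whatever the pacing budget allows ...
 *		tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *		return (0);
 *	}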
 *
 * Now the second function (actually two functions I guess :D)
 * the tcp_hpts system provides is the ability to either abort
 * a connection (later) or process input on a connection.
 * Why would you want to do this?  To keep processor locality.
 *
 * So in order to use the input redirection function the
 * stack changes its tcp_do_segment() routine to, instead
 * of processing the data, call the function:
 *
 *	tcp_queue_pkt_to_input()
 *
 * You will note that the arguments to this function look
 * a lot like tcp_do_segment()'s arguments.  This function
 * will ensure that the tcp_hpts system calls the function
 * tfb_tcp_hpts_do_segment() from the correct CPU.  Note that
 * multiple calls can get pushed into the tcp_hpts system;
 * this will be indicated by the next to last argument to
 * tfb_tcp_hpts_do_segment() (nxt_pkt).  If nxt_pkt is 1 then
 * another packet is coming.  If nxt_pkt is 0 then this is the
 * last call that the tcp_hpts system has available for the
 * tcp stack.
 *
 * The other point of the input system is to be able to safely
 * drop a tcp connection without worrying about the recursive
 * locking that may be occurring on the INP_WLOCK.  So if
 * a stack wants to drop a connection it calls:
 *
 *	tcp_set_inp_to_drop(tp, ETIMEDOUT)
 *
 * to schedule the tcp_hpts system to call
 *
 *	tcp_drop(tp, drop_reason)
 *
 * at a future point.  This is quite handy to prevent locking
 * issues when dropping connections.
 *
 */

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>

#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
#endif				/* TCPDEBUG */
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif

#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif				/* IPSEC */
#include "opt_rss.h"

MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 0;
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;

TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);

static struct tcp_hptsi tcp_pace;

static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int32_t tcp_hpts_callout_skip_swi = 0;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");

#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)

static int32_t logging_on = 0;
static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
static int32_t tcp_hpts_precision = 120;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for PRE() precision of callout");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &logging_on, 0,
    "Turn on logging if compiled in");

counter_u64_t hpts_loops;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
    &back_tosleep, "Number of times hpts found no tcbs");

static int32_t in_newts_every_tcb = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
    &in_newts_every_tcb, 0,
    "Do we take a new cts for every tcb we process for input");
static int32_t in_ts_percision = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
    &in_ts_percision, 0,
    "Do we use a precise timestamp for clients on input");
static int32_t out_newts_every_tcb = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
    &out_newts_every_tcb, 0,
    "Do we take a new cts for every tcb we process for output");
static int32_t out_ts_percision = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
    &out_ts_percision, 0,
    "Do we use a precise timestamp for every output cts");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
    &hpts_sleep_max, 0,
    "The maximum time the hpts will sleep <1 - 254>");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
    &tcp_min_hptsi_time, 0,
    "The minimum time the hpts must sleep before processing more slots");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
    &tcp_hpts_callout_skip_swi, 0,
    "Do we have the callout call directly to the hpts?");

static void
__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
    uint32_t ticknow, int32_t line)
{
	struct hpts_log *pl;

	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_log == NULL)
		return;
	pl = &hpts->p_log[hpts->p_log_at];
	hpts->p_log_at++;
	if (hpts->p_log_at >= hpts->p_logsize) {
		hpts->p_log_at = 0;
		hpts->p_log_wrapped = 1;
	}
	pl->inp = inp;
	if (inp) {
		pl->t_paceslot = inp->inp_hptsslot;
		pl->t_hptsreq = inp->inp_hpts_request;
		pl->p_onhpts = inp->inp_in_hpts;
		pl->p_oninput = inp->inp_in_input;
	} else {
		pl->t_paceslot = 0;
		pl->t_hptsreq = 0;
		pl->p_onhpts = 0;
		pl->p_oninput = 0;
	}
	pl->is_notempty = 1;
	pl->event = event;
	pl->line = line;
	pl->cts = tcp_get_usecs(NULL);
	pl->p_curtick = hpts->p_curtick;
	pl->p_prevtick = hpts->p_prevtick;
	pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
	pl->ticknow = ticknow;
	pl->slot_req = slot;
	pl->p_nxt_slot = hpts->p_nxt_slot;
	pl->p_cur_slot = hpts->p_cur_slot;
	pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
	pl->p_flags = (hpts->p_cpu & 0x7f);
	pl->p_flags <<= 7;
	pl->p_flags |= (hpts->p_num & 0x7f);
	pl->p_flags <<= 2;
	if (hpts->p_hpts_active) {
		pl->p_flags |= HPTS_HPTS_ACTIVE;
	}
}

#define	tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)

static void
hpts_timeout_swi(void *arg)
{
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
}

static void
hpts_timeout_dir(void *arg)
{
	tcp_hpts_thread(arg);
}

static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_hpts == 0) {
		/* We are not on the hpts? */
		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
	}
	if (TAILQ_EMPTY(head) &&
	    (hpts->p_on_queue_cnt != 0)) {
		/* We should not be empty with a queue count */
		panic("%s hpts:%p hpts bucket empty but cnt:%d",
		    __FUNCTION__, hpts, hpts->p_on_queue_cnt);
	}
#endif
	TAILQ_REMOVE(head, inp, inp_hpts);
	hpts->p_on_queue_cnt--;
	if (hpts->p_on_queue_cnt < 0) {
		/* Count should not go negative .. */
#ifdef INVARIANTS
		panic("Hpts goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_queue_cnt = 0;
	}
	if (clear) {
		inp->inp_hpts_request = 0;
		inp->inp_in_hpts = 0;
	}
}

static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if ((noref == 0) && (inp->inp_in_hpts == 1)) {
		/* We are already on the hpts? */
		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
	inp->inp_in_hpts = 1;
	hpts->p_on_queue_cnt++;
	if (noref == 0) {
		in_pcbref(inp);
	}
}
static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 0) {
		/* We are not on the input hpts? */
		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
	hpts->p_on_inqueue_cnt--;
	if (hpts->p_on_inqueue_cnt < 0) {
#ifdef INVARIANTS
		panic("Hpts in goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_inqueue_cnt = 0;
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		/* We should not be empty with a queue count */
		panic("%s hpts:%p in_hpts input empty but cnt:%d",
		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	if (clear)
		inp->inp_in_input = 0;
}

static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 1) {
		/* We are already on the input hpts? */
		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
	inp->inp_in_input = 1;
	hpts->p_on_inqueue_cnt++;
	in_pcbref(inp);
}

static int
sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
{
	struct tcp_hpts_entry *hpts;
	size_t sz;
	int32_t logging_was, i;
	int32_t error = 0;

	/*
	 * HACK: Turn off logging so no locks are required; this really
	 * needs a memory barrier :)
	 */
	logging_was = logging_on;
	logging_on = 0;
	if (!req->oldptr) {
		/* How much? */
		sz = 0;
		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
			hpts = tcp_pace.rp_ent[i];
			if (hpts->p_log == NULL)
				continue;
			sz += (sizeof(struct hpts_log) * hpts->p_logsize);
		}
		error = SYSCTL_OUT(req, 0, sz);
	} else {
		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
			hpts = tcp_pace.rp_ent[i];
			if (hpts->p_log == NULL)
				continue;
			if (hpts->p_log_wrapped)
				sz = (sizeof(struct hpts_log) * hpts->p_logsize);
			else
				sz = (sizeof(struct hpts_log) * hpts->p_log_at);
			error = SYSCTL_OUT(req, hpts->p_log, sz);
		}
	}
	logging_on = logging_was;
	return error;
}

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");


static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	swi_sched(hpts->ie_cookie, 0);
	if (hpts->p_hpts_active == 2) {
		/* Rare sleeping on an ENOBUF */
		wakeup_one(hpts);
	}
}

static void
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	swi_sched(hpts->ie_cookie, 0);
	if (hpts->p_hpts_active == 2) {
		/* Rare sleeping on an ENOBUF */
		wakeup_one(hpts);
	}
}

struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
	int32_t hpts_num;
	struct tcp_hpts_entry *hpts;

	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	return (hpts);
}

struct tcp_hpts_entry *
tcp_hpts_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_hpts_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

struct tcp_hpts_entry *
tcp_input_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_input_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_input_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}
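/*
 * Editorial note on the two lock routines above: the inp's assigned
 * hpts/input CPU can change between reading inp_hpts_cpu (or
 * inp_input_cpu) and acquiring that entry's mutex, so both routines
 * re-check the cpu after mtx_lock() and retry.  A caller therefore
 * always works with the entry that is returned, never a cached one:
 *
 *	hpts = tcp_hpts_lock(inp);
 *	... work on inp under hpts->p_mtx ...
 *	mtx_unlock(&hpts->p_mtx);
 */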
static void
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
	int32_t add_freed;

	if (inp->inp_flags2 & INP_FREED) {
		/*
		 * Need to play a special trick so that in_pcbrele_wlocked
		 * does not return 1 when it really should have returned 0.
		 */
		add_freed = 1;
		inp->inp_flags2 &= ~INP_FREED;
	} else {
		add_freed = 0;
	}
#ifndef INP_REF_DEBUG
	if (in_pcbrele_wlocked(inp)) {
		/*
		 * This should not happen.  We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts.  It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#else
	if (__in_pcbrele_wlocked(inp, line)) {
		/*
		 * This should not happen.  We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts.  It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#endif
	if (add_freed) {
		inp->inp_flags2 |= INP_FREED;
	}
}

static void
tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	if (inp->inp_in_hpts) {
		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

static void
tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input) {
		hpts_sane_input_remove(hpts, inp, 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

/*
 * Called normally with the INP locked, but it does not matter: the
 * hpts lock is the key.  The lock order allows us to hold the
 * INP lock and then get the hpts lock.
 *
 * Valid values in the flags are
 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
 * HPTS_REMOVE_INPUT - remove from the input of the hpts.
 * Note that you can OR both values together and get two
 * actions.
 */
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	if (flags & HPTS_REMOVE_OUTPUT) {
		hpts = tcp_hpts_lock(inp);
		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
	if (flags & HPTS_REMOVE_INPUT) {
		hpts = tcp_input_lock(inp);
		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
}

static inline int
hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
{
	return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
}
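/*
 * Editorial worked example of the wheel arithmetic (hedged: the
 * concrete numbers follow only from the comments in this file, which
 * imply a 10 usec slot on a 1024000 usec wheel of NUM_OF_HPTSI_SLOTS
 * slots).  If p_prevtick maps to slot 100, then hpts_tick(hpts, 1) is
 * slot 101, and a request of HPTS_USEC_TO_SLOTS(550) = 55 slots lands
 * at
 *
 *	(100 + 1 + 55) % NUM_OF_HPTSI_SLOTS
 *
 * wrapping when it runs off the end of the wheel.  The insert paths
 * below always hold two slots back (the "take away 2" logic), so a
 * request can never be placed on the slot the hpts is currently
 * draining.
 */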
static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
	int32_t need_wake = 0;
	uint32_t ticknow = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_hpts == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		if (hpts->p_hpts_active == 0) {
			/* A sleeping hpts we want in next slot to run */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
				    hpts_tick(hpts, 1));
			}
			inp->inp_hptsslot = hpts_tick(hpts, 1);
			inp->inp_hpts_request = 0;
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
			}
			need_wake = 1;
		} else if ((void *)inp == hpts->p_inp) {
			/*
			 * We can't allow you to go into the same slot we
			 * are in.  We must put you out.
			 */
			inp->inp_hptsslot = hpts->p_nxt_slot;
		} else
			inp->inp_hptsslot = hpts->p_cur_slot;
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		inp->inp_hpts_request = 0;
		if (logging_on) {
			tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
		}
		if (need_wake) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
			}
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
	}
	return (need_wake);
}

int
__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
{
	int32_t ret;
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

static void
tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
    struct hpts_diag *diag, int32_t noref)
{
	int32_t need_new_to = 0;
	int32_t need_wakeup = 0;
	uint32_t largest_slot;
	uint32_t ticknow = 0;
	uint32_t slot_calc;

	HPTS_MTX_ASSERT(hpts);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
		diag->slot_req = slot;
	}
	if ((inp->inp_in_hpts == 0) || noref) {
		inp->inp_hpts_request = slot;
		if (slot == 0) {
			/* Immediate */
			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
			return;
		}
		if (hpts->p_hpts_active) {
			/*
			 * It's slot - 1, since nxt_slot is the next tick
			 * that will go off while the hpts is awake.
			 */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
			}
			/*
			 * We want to make sure that we don't place an inp
			 * in the range of p_cur_slot <-> p_nxt_slot.  If we
			 * take from p_nxt_slot to the end, plus p_cur_slot,
			 * and then take away 2, we will know the max number
			 * of slots we can use.
			 */
			if (hpts->p_nxt_slot > hpts->p_cur_slot) {
				/*
				 * Non-wrap case nxt_slot <-> cur_slot we
				 * don't want to land in.  So the diff gives
				 * us what is taken away from the number of
				 * slots.
				 */
				largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
			} else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
				largest_slot = NUM_OF_HPTSI_SLOTS - 2;
			} else {
				/*
				 * Wrap case, so the diff gives us the number
				 * of slots that we can land in.
				 */
				largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
			}
			/*
			 * We take away two so we never have a problem (20
			 * usecs) out of 1024000 usecs.
			 */
			largest_slot -= 2;
			if (inp->inp_hpts_request > largest_slot) {
				/*
				 * Restrict max jump of slots and remember
				 * leftover
				 */
				slot = largest_slot;
				inp->inp_hpts_request -= largest_slot;
			} else {
				/* This one will run when we hit it */
				inp->inp_hpts_request = 0;
			}
			if (hpts->p_nxt_slot == hpts->p_cur_slot)
				slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
			else
				slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
			if (slot_calc == hpts->p_cur_slot) {
#ifdef INVARIANTS
				/* TSNH */
				panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
				    hpts, slot_calc, slot, largest_slot);
#endif
				if (slot_calc)
					slot_calc--;
				else
					slot_calc = NUM_OF_HPTSI_SLOTS - 1;
			}
			inp->inp_hptsslot = slot_calc;
			if (diag) {
				diag->inp_hptsslot = inp->inp_hptsslot;
			}
		} else {
			/*
			 * The hpts is sleeping; we need to figure out where
			 * it will wake up and whether we need to reschedule
			 * its time-out.
			 */
			uint32_t have_slept, yet_to_sleep;
			uint32_t slot_now;
			struct timeval tv;

			ticknow = tcp_gethptstick(&tv);
			slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
			/*
			 * The user wants to be inserted at (slot_now +
			 * slot) % NUM_OF_HPTSI_SLOTS, so let's set that up.
			 */
			largest_slot = NUM_OF_HPTSI_SLOTS - 2;
			if (inp->inp_hpts_request > largest_slot) {
				/* Adjust the residual in inp_hpts_request */
				slot = largest_slot;
				inp->inp_hpts_request -= largest_slot;
			} else {
				/* No residual; it all fits. */
				inp->inp_hpts_request = 0;
			}
			inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
			if (diag) {
				diag->slot_now = slot_now;
				diag->inp_hptsslot = inp->inp_hptsslot;
				diag->p_on_min_sleep = hpts->p_on_min_sleep;
			}
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
			}
			/* Now do we need to restart the hpts's timer? */
			if (TSTMP_GT(ticknow, hpts->p_curtick))
				have_slept = ticknow - hpts->p_curtick;
			else
				have_slept = 0;
			if (have_slept < hpts->p_hpts_sleep_time) {
				/* This should be what happens */
				yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
			} else {
				/* We are over-due */
				yet_to_sleep = 0;
				need_wakeup = 1;
			}
			if (diag) {
				diag->have_slept = have_slept;
				diag->yet_to_sleep = yet_to_sleep;
				diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
			}
			if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
				/*
				 * We need to reschedule the hpts's time-out.
				 */
				hpts->p_hpts_sleep_time = slot;
				need_new_to = slot * HPTS_TICKS_PER_USEC;
			}
		}
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		if (logging_on) {
			tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
		}
		/*
		 * Now how far out is the hpts sleeping?  If active is 1,
		 * it's up and ticking and we do nothing; otherwise we may
		 * need to reschedule its callout if need_new_to is set
		 * from above.
		 */
		if (need_wakeup) {
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
			}
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
			if (diag) {
				diag->need_new_to = 0;
				diag->co_ret = 0xffff0000;
			}
		} else if (need_new_to) {
			int32_t co_ret;
			struct timeval tv;
			sbintime_t sb;

			tv.tv_sec = 0;
			tv.tv_usec = 0;
			while (need_new_to > HPTS_USEC_IN_SEC) {
				tv.tv_sec++;
				need_new_to -= HPTS_USEC_IN_SEC;
			}
			tv.tv_usec = need_new_to;
			sb = tvtosbt(tv);
			if (tcp_hpts_callout_skip_swi == 0) {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_swi, hpts, hpts->p_cpu,
				    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
			} else {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_dir, hpts,
				    hpts->p_cpu,
				    C_PREL(tcp_hpts_precision));
			}
			if (diag) {
				diag->need_new_to = need_new_to;
				diag->co_ret = co_ret;
			}
		}
	} else {
#ifdef INVARIANTS
		panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
#endif
	}
}

uint32_t
tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
{
	struct tcp_hpts_entry *hpts;
	uint32_t slot_on, cts;
	struct timeval tv;

	/*
	 * We now return the next-slot the hpts will be on, beyond its
	 * current run (if up) or where it was when it stopped if it is
	 * sleeping.
	 */
	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	if (in_ts_percision)
		microuptime(&tv);
	else
		getmicrouptime(&tv);
	cts = tcp_tv_to_usectick(&tv);
	tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
	slot_on = hpts->p_nxt_slot;
	mtx_unlock(&hpts->p_mtx);
	return (slot_on);
}

uint32_t
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line)
{
	return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}

int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
	int32_t retval = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		retval = 1;
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
			}
			retval = 2;
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		retval = 4;
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	return (retval);
}

void
tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked)
{
	/* Setup packet for input first */
	INP_WLOCK_ASSERT(tp->t_inpcb);
	m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
	m->m_pkthdr.pace_tlen = (uint16_t) tlen;
	m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
	m->m_pkthdr.pace_tos = iptos;
	m->m_pkthdr.pace_lock = (uint8_t) ti_locked;
	if (tp->t_in_pkt == NULL) {
		tp->t_in_pkt = m;
		tp->t_tail_pkt = m;
	} else {
		tp->t_tail_pkt->m_nextpkt = m;
		tp->t_tail_pkt = m;
	}
}


int32_t
__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line)
{
	struct tcp_hpts_entry *hpts;
	int32_t ret;

	tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos, ti_locked);
	hpts = tcp_input_lock(tp->t_inpcb);
	ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

void
__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
{
	struct tcp_hpts_entry *hpts;
	struct tcpcb *tp;

	tp = intotcpcb(inp);
	hpts = tcp_input_lock(tp->t_inpcb);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	inp->inp_hpts_drop_reas = reason;
	mtx_unlock(&hpts->p_mtx);
}
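/*
 * Hedged usage sketch for the input-redirection and drop entry points
 * above (editorial; "example_do_segment" is an invented name, not code
 * from any real stack).  A stack defers its segment processing by
 * queueing instead of processing:
 *
 *	static void
 *	example_do_segment(struct mbuf *m, struct tcphdr *th,
 *	    struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen,
 *	    int32_t tlen, uint8_t iptos, int32_t ti_locked)
 *	{
 *		__tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen,
 *		    iptos, ti_locked, __LINE__);
 *	}
 *
 * The real work then happens in the stack's tfb_tcp_hpts_do_segment()
 * on the hpts's CPU, with nxt_pkt saying whether more queued packets
 * follow.  Similarly, __tcp_set_inp_to_drop(inp, ETIMEDOUT, __LINE__)
 * arranges for tcp_drop() to be called later from the hpts rather than
 * from a context that may already hold the INP_WLOCK recursively.
 */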
static uint16_t
hpts_random_cpu(struct inpcb *inp)
{
	/*
	 * No flow type set; distribute the load randomly.
	 */
	uint16_t cpuid;
	uint32_t ran;

	/*
	 * If one has been set use it, i.e. we want both in and out on the
	 * same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* Nothing set; use a random number */
	ran = arc4random();
	cpuid = (ran & 0xffff) % mp_ncpus;
	return (cpuid);
}

static uint16_t
hpts_cpuid(struct inpcb *inp)
{
	uint16_t cpuid;


	/*
	 * If one has been set use it, i.e. we want both in and out on the
	 * same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* If one is set the other must be the same */
#ifdef RSS
	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
	if (cpuid == NETISR_CPUID_NONE)
		return (hpts_random_cpu(inp));
	else
		return (cpuid);
#else
	/*
	 * We don't have a flowid -> cpuid mapping, so cheat and just map
	 * unknown cpuids to curcpu.  Not the best, but apparently better
	 * than defaulting to swi 0.
	 */
	if (inp->inp_flowtype != M_HASHTYPE_NONE) {
		cpuid = inp->inp_flowid % mp_ncpus;
		return (cpuid);
	}
	cpuid = hpts_random_cpu(inp);
	return (cpuid);
#endif
}

/*
 * Do NOT try to optimize the processing of inp's
 * by first pulling off all the inp's into a temporary
 * list (e.g. TAILQ_CONCAT).  If you do that, the subtle
 * interactions of switching CPU's will be fatal because
 * of problems in the linked list manipulation.  Basically,
 * you would switch cpu's with the hpts mutex locked,
 * but then while you were processing one of the inp's
 * some other one that you switched will get a new
 * packet on its (different) CPU.  It will insert it
 * on the new hpts's input list.  Creating a temporary
 * link in the inp will not fix it either, since
 * the other hpts will be doing the same thing and
 * you will both end up using the temporary link.
 *
 * You will die in an ASSERT for tailq corruption if you
 * run INVARIANTS, or you will die horribly without
 * INVARIANTS in some unknown way with a corrupt linked
 * list.
 */
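/*
 * For concreteness, this editorial sketch is the tempting but broken
 * pattern the warning above is about (do NOT do this):
 *
 *	struct hptsh tmp;
 *
 *	TAILQ_INIT(&tmp);
 *	TAILQ_CONCAT(&tmp, &hpts->p_input, inp_input);
 *	mtx_unlock(&hpts->p_mtx);
 *	... walk tmp without the hpts lock ...
 *
 * Once p_mtx is dropped, an inp sitting on tmp can migrate and be
 * inserted on another hpts's p_input list through the very same
 * inp_input linkage, corrupting both lists.  Hence tcp_input_data()
 * below pops one inp at a time while holding the lock.
 */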
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
	struct mbuf *m, *n;
	struct tcpcb *tp;
	struct inpcb *inp;
	uint16_t drop_reason;
	int16_t set_cpu;
	uint32_t did_prefetch = 0;
	int32_t ti_locked = TI_UNLOCKED;

	HPTS_MTX_ASSERT(hpts);
	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
		HPTS_MTX_ASSERT(hpts);
		hpts_sane_input_remove(hpts, inp, 0);
		if (inp->inp_input_cpu_set == 0) {
			set_cpu = 1;
		} else {
			set_cpu = 0;
		}
		hpts->p_inp = inp;
		drop_reason = inp->inp_hpts_drop_reas;
		inp->inp_in_input = 0;
		mtx_unlock(&hpts->p_mtx);
		CURVNET_SET(inp->inp_vnet);
		if (drop_reason) {
			INP_INFO_RLOCK(&V_tcbinfo);
			ti_locked = TI_RLOCKED;
		} else {
			ti_locked = TI_UNLOCKED;
		}
		INP_WLOCK(inp);
		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
		    (inp->inp_flags2 & INP_FREED)) {
out:
			hpts->p_inp = NULL;
			if (ti_locked == TI_RLOCKED) {
				INP_INFO_RUNLOCK(&V_tcbinfo);
			}
			if (in_pcbrele_wlocked(inp) == 0) {
				INP_WUNLOCK(inp);
			}
			ti_locked = TI_UNLOCKED;
			CURVNET_RESTORE();
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		tp = intotcpcb(inp);
		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
			goto out;
		}
		if (drop_reason) {
			/* This tcb is being destroyed for drop_reason */
			m = tp->t_in_pkt;
			if (m)
				n = m->m_nextpkt;
			else
				n = NULL;
			tp->t_in_pkt = NULL;
			while (m) {
				m_freem(m);
				m = n;
				if (m)
					n = m->m_nextpkt;
			}
			tp = tcp_drop(tp, drop_reason);
			INP_INFO_RUNLOCK(&V_tcbinfo);
			if (tp == NULL) {
				INP_WLOCK(inp);
			}
			if (in_pcbrele_wlocked(inp) == 0)
				INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		if (set_cpu) {
			/*
			 * Setup so the next time we will move to the right
			 * CPU.  This should be a rare event.  It will
			 * sometimes happen when we are the client side
			 * (usually not the server).  Somehow tcp_output()
			 * gets called before the tcp_do_segment() sets the
			 * initial state.  This means the r_cpu and
			 * r_hpts_cpu is 0.  We get on the hpts, and then
			 * tcp_input() gets called setting up the r_cpu to
			 * the correct value.  The hpts goes off and sees
			 * the mis-match.  We simply correct it here and
			 * the CPU will switch to the new hpts next time
			 * the tcb gets added to the hpts (not this time)
			 * :-)
			 */
			tcp_set_hpts(inp);
		}
		m = tp->t_in_pkt;
		n = NULL;
		if (m != NULL &&
		    (m->m_pkthdr.pace_lock == TI_RLOCKED ||
		    tp->t_state != TCPS_ESTABLISHED)) {
			ti_locked = TI_RLOCKED;
			INP_INFO_RLOCK(&V_tcbinfo);
			m = tp->t_in_pkt;
		}
		if (in_newts_every_tcb) {
			if (in_ts_percision)
				microuptime(tv);
			else
				getmicrouptime(tv);
		}
		if (tp->t_fb_ptr != NULL) {
			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
			did_prefetch = 1;
		}
		/* Any input work to do?  If so, do it first. */
		if ((m != NULL) && (m == tp->t_in_pkt)) {
			struct tcphdr *th;
			int32_t tlen, drop_hdrlen, nxt_pkt;
			uint8_t iptos;

			n = m->m_nextpkt;
			tp->t_in_pkt = tp->t_tail_pkt = NULL;
			while (m) {
				th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
				tlen = m->m_pkthdr.pace_tlen;
				drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
				iptos = m->m_pkthdr.pace_tos;
				m->m_nextpkt = NULL;
				if (n)
					nxt_pkt = 1;
				else
					nxt_pkt = 0;
				inp->inp_input_calls = 1;
				if (tp->t_fb->tfb_tcp_hpts_do_segment) {
					/* Use the hpts specific do_segment */
					(*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
					    tp, drop_hdrlen,
					    tlen, iptos, ti_locked, nxt_pkt, tv);
				} else {
					/* Use the default do_segment */
					(*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
					    tp, drop_hdrlen,
					    tlen, iptos, ti_locked);
				}
				/*
				 * Do-segment returns unlocked; we need the
				 * lock again, but we also want some
				 * KASSERTs here.
				 */
				INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
				INP_UNLOCK_ASSERT(inp);
				m = n;
				if (m)
					n = m->m_nextpkt;
				if (m != NULL &&
				    m->m_pkthdr.pace_lock == TI_RLOCKED) {
					INP_INFO_RLOCK(&V_tcbinfo);
					ti_locked = TI_RLOCKED;
				} else
					ti_locked = TI_UNLOCKED;
				INP_WLOCK(inp);
				/*
				 * Since we have an opening here we must
				 * re-check if the tcb went away while we
				 * were getting the lock(s).
				 */
				if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
				    (inp->inp_flags2 & INP_FREED)) {
					while (m) {
						m_freem(m);
						m = n;
						if (m)
							n = m->m_nextpkt;
					}
					goto out;
				}
				/*
				 * Now that we hold the INP lock, check if
				 * we need to upgrade our lock.
				 */
				if (ti_locked == TI_UNLOCKED &&
				    (tp->t_state != TCPS_ESTABLISHED)) {
					ti_locked = TI_RLOCKED;
					INP_INFO_RLOCK(&V_tcbinfo);
				}
			}	/** end while(m) */
		}		/** end if ((m != NULL) && (m == tp->t_in_pkt)) */
		if (in_pcbrele_wlocked(inp) == 0)
			INP_WUNLOCK(inp);
		if (ti_locked == TI_RLOCKED)
			INP_INFO_RUNLOCK(&V_tcbinfo);
		INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
		INP_UNLOCK_ASSERT(inp);
		ti_locked = TI_UNLOCKED;
		mtx_lock(&hpts->p_mtx);
		hpts->p_inp = NULL;
		CURVNET_RESTORE();
	}
}

static int
tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
{
	int32_t ticks_to_run;

	if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
		ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
		if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
			ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
		}
	} else {
		if (hpts->p_prevtick == hpts->p_curtick) {
			/* This happens when we get woken up right away */
			return (-1);
		}
		ticks_to_run = 1;
	}
	/* Set in where we will be when we catch up */
	hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
	if (hpts->p_nxt_slot == hpts->p_cur_slot) {
		panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
		    hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
	}
	return (ticks_to_run);
}

static void
tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
{
	struct tcpcb *tp;
	struct inpcb *inp = NULL, *ninp;
	struct timeval tv;
	int32_t ticks_to_run, i, error, tick_now, interum_tick;
	int32_t paced_cnt = 0;
	int32_t did_prefetch = 0;
	int32_t prefetch_ninp = 0;
	int32_t prefetch_tp = 0;
	uint32_t cts;
	int16_t set_cpu;

	HPTS_MTX_ASSERT(hpts);
	hpts->p_curtick = tcp_tv_to_hptstick(ctick);
	cts = tcp_tv_to_usectick(ctick);
	memcpy(&tv, ctick, sizeof(struct timeval));
	hpts->p_cur_slot = hpts_tick(hpts, 1);

	/* Figure out if we had missed ticks */
again:
	HPTS_MTX_ASSERT(hpts);
	ticks_to_run = tcp_hpts_est_run(hpts);
	if (!TAILQ_EMPTY(&hpts->p_input)) {
		tcp_input_data(hpts, &tv);
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("tp:%p in_hpts input empty but cnt:%d",
		    hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	HPTS_MTX_ASSERT(hpts);
	/* Reset the ticks to run and the time if we need to */
	interum_tick = tcp_gethptstick(&tv);
	if (interum_tick != hpts->p_curtick) {
		/* Save off the new time we execute to */
		*ctick = tv;
		hpts->p_curtick = interum_tick;
		cts = tcp_tv_to_usectick(&tv);
		hpts->p_cur_slot = hpts_tick(hpts, 1);
		ticks_to_run = tcp_hpts_est_run(hpts);
	}
	if (ticks_to_run == -1) {
		goto no_run;
	}
	if (logging_on) {
		tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
	}
	if (hpts->p_on_queue_cnt == 0) {
		goto no_one;
	}
	HPTS_MTX_ASSERT(hpts);
	for (i = 0; i < ticks_to_run; i++) {
		/*
		 * Calculate our delay; if there are no extra ticks there
		 * was no delay.
		 */
		hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
		HPTS_MTX_ASSERT(hpts);
		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
			/* For debugging */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
			}
			hpts->p_inp = inp;
			paced_cnt++;
			if (hpts->p_cur_slot != inp->inp_hptsslot) {
				panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
				    hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
			}
			/* Now pull it */
			if (inp->inp_hpts_cpu_set == 0) {
				set_cpu = 1;
			} else {
				set_cpu = 0;
			}
			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
				/* We prefetch the next inp if possible */
				kern_prefetch(ninp, &prefetch_ninp);
				prefetch_ninp = 1;
			}
			if (inp->inp_hpts_request) {
				/*
				 * This guy is deferred out further in time
				 * than our wheel had on it.  Push him back
				 * on the wheel.
				 */
				int32_t remaining_slots;

				remaining_slots = ticks_to_run - (i + 1);
				if (inp->inp_hpts_request > remaining_slots) {
					/*
					 * Keep INVARIANTS happy by clearing
					 * the flag
					 */
					tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
					hpts->p_inp = NULL;
					continue;
				}
				inp->inp_hpts_request = 0;
			}
			/*
			 * We clear the hpts flag here after dealing with
			 * remaining slots.  This way anyone looking with
			 * the TCB lock will see it's on the hpts until
			 * just before we unlock.
			 */
			inp->inp_in_hpts = 0;
			mtx_unlock(&hpts->p_mtx);
			INP_WLOCK(inp);
			if (in_pcbrele_wlocked(inp)) {
				mtx_lock(&hpts->p_mtx);
				if (logging_on)
					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
				hpts->p_inp = NULL;
				continue;
			}
			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
out_now:
#ifdef INVARIANTS
				if (mtx_owned(&hpts->p_mtx)) {
					panic("Hpts:%p owns mtx prior-to lock line:%d",
					    hpts, __LINE__);
				}
#endif
				INP_WUNLOCK(inp);
				mtx_lock(&hpts->p_mtx);
				if (logging_on)
					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
				hpts->p_inp = NULL;
				continue;
			}
			tp = intotcpcb(inp);
			if ((tp == NULL) || (tp->t_inpcb == NULL)) {
				goto out_now;
			}
			if (set_cpu) {
				/*
				 * Setup so the next time we will move to
				 * the right CPU.  This should be a rare
				 * event.  It will sometimes happen when we
				 * are the client side (usually not the
				 * server).  Somehow tcp_output() gets called
				 * before the tcp_do_segment() sets the
				 * initial state.  This means the r_cpu and
				 * r_hpts_cpu is 0.  We get on the hpts, and
				 * then tcp_input() gets called setting up
				 * the r_cpu to the correct value.  The hpts
				 * goes off and sees the mis-match.  We
				 * simply correct it here and the CPU will
				 * switch to the new hpts next time the tcb
				 * gets added to the hpts (not this one)
				 * :-)
				 */
				tcp_set_hpts(inp);
			}
			if (out_newts_every_tcb) {
				struct timeval sv;

				if (out_ts_percision)
					microuptime(&sv);
				else
					getmicrouptime(&sv);
				cts = tcp_tv_to_usectick(&sv);
			}
			CURVNET_SET(inp->inp_vnet);
			/*
			 * There is a hole here.  We get the refcnt on the
			 * inp so it will still be preserved, but to make
			 * sure we can get the INP we need to hold the p_mtx
			 * above while we pull out the tp/inp; as long as
			 * fini gets the lock first we are assured of having
			 * a sane INP we can lock and test.
			 */
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx before tcp-output:%d",
				    hpts, __LINE__);
			}
#endif
			if (tp->t_fb_ptr != NULL) {
				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
				did_prefetch = 1;
			}
			inp->inp_hpts_calls = 1;
			if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
				error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
			} else {
				error = tp->t_fb->tfb_tcp_output(tp);
			}
			if (ninp && ninp->inp_ppcb) {
				/*
				 * If we have a nxt inp, see if we can
				 * prefetch its ppcb.  Note this may seem
				 * "risky" since we have no locks (other
				 * than the previous inp) and there is no
				 * assurance that ninp was not pulled while
				 * we were processing inp and freed.  If this
				 * occurred it could mean that either:
				 *
				 * a) It's NULL (which is fine, we won't go
				 * here) <or> b) it's valid (which is cool,
				 * we will prefetch it) <or> c) the inp got
				 * freed back to the slab which was
				 * reallocated.  Then the piece of memory was
				 * re-used and something else (not an
				 * address) is in inp_ppcb.  If that occurs
				 * we don't crash, but take a TLB shootdown
				 * performance hit (same as if it was NULL
				 * and we tried to pre-fetch it).
				 *
				 * Considering that the likelihood of <c> is
				 * quite rare we will take a risk on doing
				 * this.  If performance drops after testing
				 * we can always take this out.  NB: the
				 * kern_prefetch on amd64 actually has
				 * protection against a bad address now via
				 * the DMAP_() tests.  This will prevent the
				 * TLB hit, and instead if <c> occurs just
				 * cause us to load cache with a useless
				 * address (to us).
				 */
				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
				prefetch_tp = 1;
			}
			INP_WUNLOCK(inp);
			INP_UNLOCK_ASSERT(inp);
			CURVNET_RESTORE();
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx prior-to lock line:%d",
				    hpts, __LINE__);
			}
#endif
			mtx_lock(&hpts->p_mtx);
			if (logging_on)
				tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
			hpts->p_inp = NULL;
		}
		HPTS_MTX_ASSERT(hpts);
		hpts->p_inp = NULL;
		hpts->p_cur_slot++;
		if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
			hpts->p_cur_slot = 0;
		}
	}
no_one:
	HPTS_MTX_ASSERT(hpts);
	hpts->p_prevtick = hpts->p_curtick;
	hpts->p_delayed_by = 0;
	/*
	 * Check to see if we took an excess amount of time and need to run
	 * more ticks (if we did not hit ENOBUFS).
	 */
	/* Re-run any input that may be there */
	(void)tcp_gethptstick(&tv);
	if (!TAILQ_EMPTY(&hpts->p_input)) {
		tcp_input_data(hpts, &tv);
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("tp:%p in_hpts input empty but cnt:%d",
		    hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	tick_now = tcp_gethptstick(&tv);
	if (SEQ_GT(tick_now, hpts->p_prevtick)) {
		struct timeval res;

		/* Did we really spend a full tick or more in here? */
		timersub(&tv, ctick, &res);
		if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
			counter_u64_add(hpts_loops, 1);
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
			}
			*ctick = res;
			hpts->p_curtick = tick_now;
			goto again;
		}
	}
no_run:
	{
		uint32_t t = 0, i, fnd = 0;

		if (hpts->p_on_queue_cnt) {


			/*
			 * Find the next slot that is occupied and use that
			 * to be the sleep time.
			 */
			for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
				if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
					fnd = 1;
					break;
				}
				t = (t + 1) % NUM_OF_HPTSI_SLOTS;
			}
			if (fnd) {
				hpts->p_hpts_sleep_time = i;
			} else {
				counter_u64_add(back_tosleep, 1);
#ifdef INVARIANTS
				panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
				hpts->p_on_queue_cnt = 0;
				goto non_found;
			}
			t++;
		} else {
			/* No one on the wheel; sleep for all but 2 slots */
non_found:
			if (hpts_sleep_max == 0)
				hpts_sleep_max = 1;
			hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
			t = 0;
		}
		if (logging_on) {
			tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
		}
	}
}

void
__tcp_set_hpts(struct inpcb *inp, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	if ((inp->inp_in_hpts == 0) &&
	    (inp->inp_hpts_cpu_set == 0)) {
		inp->inp_hpts_cpu = hpts_cpuid(inp);
		inp->inp_hpts_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
	hpts = tcp_input_lock(inp);
	if ((inp->inp_input_cpu_set == 0) &&
	    (inp->inp_in_input == 0)) {
		inp->inp_input_cpu = hpts_cpuid(inp);
		inp->inp_input_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
}

uint16_t
tcp_hpts_delayedby(struct inpcb *inp)
{
	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
}
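/*
 * Hedged usage note for tcp_hpts_delayedby() (editorial): a pacing
 * stack can fold the wheel's catch-up lag into its next pacing
 * decision.  The pace_gap variable below is invented for this sketch;
 * nothing in this file computes it:
 *
 *	uint32_t late = tcp_hpts_delayedby(inp);
 *
 *	if (late < pace_gap)
 *		tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(pace_gap - late));
 *	else
 *		(void)__tcp_queue_to_hpts_immediate(inp, __LINE__);
 */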
static void
tcp_hpts_thread(void *ctx)
{
	struct tcp_hpts_entry *hpts;
	struct timeval tv;
	sbintime_t sb;

	hpts = (struct tcp_hpts_entry *)ctx;
	mtx_lock(&hpts->p_mtx);
	if (hpts->p_direct_wake) {
		/* Signaled by input */
		if (logging_on)
			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
		callout_stop(&hpts->co);
	} else {
		/* Timed out */
		if (callout_pending(&hpts->co) ||
		    !callout_active(&hpts->co)) {
			if (logging_on)
				tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
			mtx_unlock(&hpts->p_mtx);
			return;
		}
		callout_deactivate(&hpts->co);
		if (logging_on)
			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
	}
	hpts->p_hpts_active = 1;
	(void)tcp_gethptstick(&tv);
	tcp_hptsi(hpts, &tv);
	HPTS_MTX_ASSERT(hpts);
	tv.tv_sec = 0;
	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
		tv.tv_usec = tcp_min_hptsi_time;
		hpts->p_on_min_sleep = 1;
	} else {
		/* Clear the min sleep flag */
		hpts->p_on_min_sleep = 0;
	}
	hpts->p_hpts_active = 0;
	sb = tvtosbt(tv);
	if (tcp_hpts_callout_skip_swi == 0) {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_swi, hpts, hpts->p_cpu,
		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
	} else {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_dir, hpts,
		    hpts->p_cpu,
		    C_PREL(tcp_hpts_precision));
	}
	hpts->p_direct_wake = 0;
	mtx_unlock(&hpts->p_mtx);
}

#undef timersub

static void
tcp_init_hptsi(void *st)
{
	int32_t i, j, error, bound = 0, created = 0;
	size_t sz, asz;
	struct timeval tv;
	sbintime_t sb;
	struct tcp_hpts_entry *hpts;
	char unit[16];
	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;

	tcp_pace.rp_proc = NULL;
	tcp_pace.rp_num_hptss = ncpus;
	hpts_loops = counter_u64_alloc(M_WAITOK);
	back_tosleep = counter_u64_alloc(M_WAITOK);

	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
		    M_TCPHPTS, M_WAITOK | M_ZERO);
		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
		    M_TCPHPTS, M_WAITOK);
		hpts = tcp_pace.rp_ent[i];
		/*
		 * Init all the hpts structures that are not specifically
		 * zero'd by the allocations.  Also let's attach them to
		 * the appropriate sysctl block.
		 */
		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
		    "hpts", MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&hpts->p_input);
		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
			TAILQ_INIT(&hpts->p_hptss[j]);
		}
		sysctl_ctx_init(&hpts->hpts_ctx);
		sprintf(unit, "%d", i);
		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
		    OID_AUTO,
		    unit,
		    CTLFLAG_RW, 0,
		    "");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
		    &hpts->p_on_inqueue_cnt, 0,
		    "Count of TCBs awaiting input processing");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
		    &hpts->p_on_queue_cnt, 0,
		    "Count of TCBs awaiting output processing");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "active", CTLFLAG_RD,
		    &hpts->p_hpts_active, 0,
		    "Is the hpts active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curslot", CTLFLAG_RD,
		    &hpts->p_cur_slot, 0,
		    "What the current slot is if active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curtick", CTLFLAG_RD,
		    &hpts->p_curtick, 0,
		    "What the current tick is if active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "logsize", CTLFLAG_RD,
		    &hpts->p_logsize, 0,
		    "Hpts logging buffer size");
		hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
		hpts->p_num = i;
		hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
		hpts->p_prevtick -= 1;
		hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
		hpts->p_cpu = 0xffff;
		hpts->p_nxt_slot = 1;
		hpts->p_logsize = tcp_hpts_logging_size;
		if (hpts->p_logsize) {
			sz = (sizeof(struct hpts_log) * hpts->p_logsize);
			hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
		}
		callout_init(&hpts->co, 1);
	}
	/*
	 * Now let's start ithreads to handle the hptss.
	 */
	CPU_FOREACH(i) {
		hpts = tcp_pace.rp_ent[i];
		hpts->p_cpu = i;
		error = swi_add(&hpts->ie, "hpts",
		    tcp_hpts_thread, (void *)hpts,
		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
		if (error) {
			panic("Can't add hpts:%p i:%d err:%d",
			    hpts, i, error);
		}
		created++;
		if (tcp_bind_threads) {
			if (intr_event_bind(hpts->ie, i) == 0)
				bound++;
		}
		tv.tv_sec = 0;
		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
		sb = tvtosbt(tv);
		if (tcp_hpts_callout_skip_swi == 0) {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_swi, hpts, hpts->p_cpu,
			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
		} else {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_dir, hpts,
			    hpts->p_cpu,
			    C_PREL(tcp_hpts_precision));
		}
	}
	printf("TCP Hpts created %d swi interrupt threads and bound %d\n",
	    created, bound);
	return;
}

SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);