/*-
 * Copyright (c) 2016-2018 Netflix Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
/**
 * Some notes about usage.
 *
 * The tcp_hpts system is designed to provide a high precision timer
 * system for tcp.  Its main purpose is to provide a mechanism for
 * pacing packets out onto the wire.  It can be used in two ways
 * by a given TCP stack (and those two methods can be used simultaneously).
 *
 * First, and probably the main way Rack and BBR use it, it can
 * be used to call tcp_output() of a transport stack at some time in the
 * future.  The normal way this is done is that tcp_output() of the stack
 * schedules itself to be called again by calling
 * tcp_hpts_insert(tcpcb, slot).  The slot is the time from now that the
 * stack wants to be called, but it must be converted to tcp_hpts's notion
 * of a slot.  This is done with one of the macros HPTS_MS_TO_SLOTS or
 * HPTS_USEC_TO_SLOTS.  So a typical call from the tcp_output() routine
 * might look like:
 *
 *	tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
 *
 * The above would schedule tcp_output() to be called in 550 microseconds.
 * Note that if using this mechanism the stack will want to add, near
 * its top, a check to prevent unwanted calls (from user land or the
 * arrival of incoming ACKs).  So it would add something like:
 *
 *	if (inp->inp_in_hpts)
 *		return;
 *
 * to prevent output processing until the time allotted has gone by.
 * Of course this is a bare-bones example and the stack will probably
 * have more considerations than just the above.
 *
 * Now the tcp_hpts system will call tcp_output in one of two forms;
 * it will first check to see if the stack has defined a
 * tfb_tcp_output_wtime() function, and if so that is the routine it
 * will call.  If that function is not defined then it will call the
 * tfb_tcp_output() function.  The only difference between these
 * two calls is that the former passes the time in to the function
 * so the function does not have to access the time (which tcp_hpts
 * already has).  What these functions do is of course totally up
 * to the individual tcp stack.
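 *
 * As a slightly fuller sketch (purely illustrative; my_stack_output()
 * and the pacing details are invented for the example and are not part
 * of any particular stack), an output path built on this mechanism
 * might be shaped like:
 *
 *	static int
 *	my_stack_output(struct tcpcb *tp)
 *	{
 *		if (tp->t_inpcb->inp_in_hpts)
 *			return (0);	-- hpts will call us back later
 *		... emit what the pacing budget allows right now ...
 *		tcp_hpts_insert(tp->t_inpcb, HPTS_USEC_TO_SLOTS(550));
 *		return (0);
 *	}
 *
 * i.e. the stack sends what it may, re-arms itself on the wheel, and
 * relies on the inp_in_hpts check to absorb intermediate calls.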
 *
 * Now the second function (actually two functions I guess :D)
 * the tcp_hpts system provides is the ability to either abort
 * a connection (later) or process input on a connection.
 * Why would you want to do this?  To keep processor locality.
 *
 * So in order to use the input redirection function the
 * stack changes its tcp_do_segment() routine to, instead
 * of processing the data, call the function:
 *
 *	tcp_queue_pkt_to_input()
 *
 * You will note that the arguments to this function look
 * a lot like tcp_do_segment's arguments.  This function
 * will assure that the tcp_hpts system will
 * call the function tfb_tcp_hpts_do_segment() from the
 * correct CPU.  Note that multiple calls can get pushed
 * into the tcp_hpts system; this will be indicated by
 * the next-to-last argument to tfb_tcp_hpts_do_segment()
 * (nxt_pkt).  If nxt_pkt is a 1 then another packet is
 * coming.  If nxt_pkt is a 0 then this is the last call
 * that the tcp_hpts system has available for the tcp stack.
 *
 * The other point of the input system is to be able to safely
 * drop a tcp connection without worrying about the recursive
 * locking that may be occurring on the INP_WLOCK.  So if
 * a stack wants to drop a connection it calls:
 *
 *	tcp_set_inp_to_drop(tp, ETIMEDOUT)
 *
 * to schedule the tcp_hpts system to call
 *
 *	tcp_drop(tp, drop_reason)
 *
 * at a future point.  This is quite handy to prevent locking
 * issues when dropping connections.
 *
 */
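/*
 * A hedged sketch of the input-redirection side (again illustrative;
 * my_stack_do_segment() is an invented name and the body is elided):
 *
 *	static void
 *	my_stack_do_segment(struct mbuf *m, struct tcphdr *th,
 *	    struct socket *so, struct tcpcb *tp, int drop_hdrlen,
 *	    int tlen, uint8_t iptos)
 *	{
 *		-- defer the work; hpts later calls our
 *		-- tfb_tcp_hpts_do_segment() from this connection's
 *		-- input CPU
 *		tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
 *	}
 *
 * The stack's tfb_tcp_hpts_do_segment() handler then does the real
 * processing, batching while nxt_pkt == 1 and finishing any deferred
 * work (e.g. an output pass) when nxt_pkt == 0.
 */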
#include <sys/param.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/module.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h>		/* for proc0 declaration */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/refcount.h>
#include <sys/sched.h>
#include <sys/queue.h>
#include <sys/smp.h>
#include <sys/counter.h>
#include <sys/time.h>
#include <sys/kthread.h>
#include <sys/kern_prefetch.h>

#include <vm/uma.h>

#include <net/route.h>
#include <net/vnet.h>

#define TCPSTATES		/* for logging */

#include <netinet/in.h>
#include <netinet/in_kdtrace.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>

#ifdef tcpdebug
#include <netinet/tcp_debug.h>
#endif				/* tcpdebug */
#ifdef tcp_offload
#include <netinet/tcp_offload.h>
#endif

#include "opt_rss.h"

MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
#include <net/netisr.h>
#include <net/rss_config.h>
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 0;
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);

static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;

TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);

static struct tcp_hptsi tcp_pace;

static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);

int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
static int32_t tcp_hpts_callout_skip_swi = 0;

SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");

#define	timersub(tvp, uvp, vvp)						\
	do {								\
		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
		if ((vvp)->tv_usec < 0) {				\
			(vvp)->tv_sec--;				\
			(vvp)->tv_usec += 1000000;			\
		}							\
	} while (0)

static int32_t logging_on = 0;
static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
static int32_t tcp_hpts_precision = 120;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
    &tcp_hpts_precision, 120,
    "Value for PRE() precision of callout");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
    &logging_on, 0,
    "Turn on logging if compiled in");

counter_u64_t hpts_loops;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
    &hpts_loops, "Number of times hpts had to loop to catch up");

counter_u64_t back_tosleep;

SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
    &back_tosleep, "Number of times hpts found no tcbs");

static int32_t in_newts_every_tcb = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
    &in_newts_every_tcb, 0,
    "Do we have a new cts every tcb we process for input");
static int32_t in_ts_percision = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
    &in_ts_percision, 0,
    "Do we use precise timestamps for clients on input");
static int32_t out_newts_every_tcb = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
    &out_newts_every_tcb, 0,
    "Do we have a new cts every tcb we process for output");
static int32_t out_ts_percision = 0;

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
    &out_ts_percision, 0,
    "Do we use a precise timestamp for every output cts");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
    &hpts_sleep_max, 0,
    "The maximum time the hpts will sleep <1 - 254>");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
    &tcp_min_hptsi_time, 0,
    "The minimum time the hpts must sleep before processing more slots");

SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
    &tcp_hpts_callout_skip_swi, 0,
    "Do we have the callout call directly to the hpts?");

static void
__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
    uint32_t ticknow, int32_t line)
{
	struct hpts_log *pl;

	HPTS_MTX_ASSERT(hpts);
	if (hpts->p_log == NULL)
		return;
	pl = &hpts->p_log[hpts->p_log_at];
	hpts->p_log_at++;
	if (hpts->p_log_at >= hpts->p_logsize) {
		hpts->p_log_at = 0;
		hpts->p_log_wrapped = 1;
	}
	pl->inp = inp;
	if (inp) {
		pl->t_paceslot = inp->inp_hptsslot;
		pl->t_hptsreq = inp->inp_hpts_request;
		pl->p_onhpts = inp->inp_in_hpts;
		pl->p_oninput = inp->inp_in_input;
	} else {
		pl->t_paceslot = 0;
		pl->t_hptsreq = 0;
		pl->p_onhpts = 0;
		pl->p_oninput = 0;
	}
	pl->is_notempty = 1;
	pl->event = event;
	pl->line = line;
	pl->cts = tcp_get_usecs(NULL);
	pl->p_curtick = hpts->p_curtick;
	pl->p_prevtick = hpts->p_prevtick;
	pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
	pl->ticknow = ticknow;
	pl->slot_req = slot;
	pl->p_nxt_slot = hpts->p_nxt_slot;
	pl->p_cur_slot = hpts->p_cur_slot;
	pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
	pl->p_flags = (hpts->p_cpu & 0x7f);
	pl->p_flags <<= 7;
	pl->p_flags |= (hpts->p_num & 0x7f);
	pl->p_flags <<= 2;
	if (hpts->p_hpts_active) {
		pl->p_flags |= HPTS_HPTS_ACTIVE;
	}
}

#define	tcp_hpts_log_it(a, b, c, d, e)	__tcp_hpts_log_it(a, b, c, d, e, __LINE__)

static void
hpts_timeout_swi(void *arg)
{
	struct tcp_hpts_entry *hpts;

	hpts = (struct tcp_hpts_entry *)arg;
	swi_sched(hpts->ie_cookie, 0);
}

static void
hpts_timeout_dir(void *arg)
{
	tcp_hpts_thread(arg);
}

static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_hpts == 0) {
		/* We are not on the hpts? */
		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
	}
	if (TAILQ_EMPTY(head) &&
	    (hpts->p_on_queue_cnt != 0)) {
		/* We should not be empty with a queue count */
		panic("%s hpts:%p hpts bucket empty but cnt:%d",
		    __FUNCTION__, hpts, hpts->p_on_queue_cnt);
	}
#endif
	TAILQ_REMOVE(head, inp, inp_hpts);
	hpts->p_on_queue_cnt--;
	if (hpts->p_on_queue_cnt < 0) {
		/* Count should not go negative .. */
#ifdef INVARIANTS
		panic("Hpts goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_queue_cnt = 0;
	}
	if (clear) {
		inp->inp_hpts_request = 0;
		inp->inp_in_hpts = 0;
	}
}

static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_hpts_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if ((noref == 0) && (inp->inp_in_hpts == 1)) {
		/* We are already on the hpts? */
		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
	inp->inp_in_hpts = 1;
	hpts->p_on_queue_cnt++;
	if (noref == 0) {
		in_pcbref(inp);
	}
}

static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 0) {
		/* We are not on the input hpts? */
		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
	hpts->p_on_inqueue_cnt--;
	if (hpts->p_on_inqueue_cnt < 0) {
#ifdef INVARIANTS
		panic("Hpts in goes negative inp:%p hpts:%p",
		    inp, hpts);
#endif
		hpts->p_on_inqueue_cnt = 0;
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		/* We should not be empty with a queue count */
		panic("%s hpts:%p in_hpts input empty but cnt:%d",
		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	if (clear)
		inp->inp_in_input = 0;
}

static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx) == 0) {
		/* We don't own the mutex? */
		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
	}
	if (hpts->p_cpu != inp->inp_input_cpu) {
		/* It is not the right cpu/mutex? */
		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
	}
	if (inp->inp_in_input == 1) {
		/* We are already on the input hpts? */
		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
	}
#endif
	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
	inp->inp_in_input = 1;
	hpts->p_on_inqueue_cnt++;
	in_pcbref(inp);
}

static int
sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
{
	struct tcp_hpts_entry *hpts;
	size_t sz;
	int32_t logging_was, i;
	int32_t error = 0;

	/*
	 * HACK: Turn off logging so no locks are required; this really
	 * needs a memory barrier :)
	 */
	logging_was = logging_on;
	logging_on = 0;
	if (!req->oldptr) {
		/* How much? */
		sz = 0;
		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
			hpts = tcp_pace.rp_ent[i];
			if (hpts->p_log == NULL)
				continue;
			sz += (sizeof(struct hpts_log) * hpts->p_logsize);
		}
		error = SYSCTL_OUT(req, 0, sz);
	} else {
		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
			hpts = tcp_pace.rp_ent[i];
			if (hpts->p_log == NULL)
				continue;
			if (hpts->p_log_wrapped)
				sz = (sizeof(struct hpts_log) * hpts->p_logsize);
			else
				sz = (sizeof(struct hpts_log) * hpts->p_log_at);
			error = SYSCTL_OUT(req, hpts->p_log, sz);
		}
	}
	logging_on = logging_was;
	return error;
}

SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
    0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");


static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	swi_sched(hpts->ie_cookie, 0);
	if (hpts->p_hpts_active == 2) {
		/* Rare sleeping on an ENOBUF */
		wakeup_one(hpts);
	}
}

static void
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
	HPTS_MTX_ASSERT(hpts);
	swi_sched(hpts->ie_cookie, 0);
	if (hpts->p_hpts_active == 2) {
		/* Rare sleeping on an ENOBUF */
		wakeup_one(hpts);
	}
}

struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
	int32_t hpts_num;
	struct tcp_hpts_entry *hpts;

	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
	return (hpts);
}

struct tcp_hpts_entry *
tcp_hpts_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_hpts_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_hpts_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

struct tcp_hpts_entry *
tcp_input_lock(struct inpcb *inp)
{
	struct tcp_hpts_entry *hpts;
	int32_t hpts_num;

again:
	hpts_num = inp->inp_input_cpu;
	hpts = tcp_pace.rp_ent[hpts_num];
#ifdef INVARIANTS
	if (mtx_owned(&hpts->p_mtx)) {
		panic("Hpts:%p owns mtx prior-to lock line:%d",
		    hpts, __LINE__);
	}
#endif
	mtx_lock(&hpts->p_mtx);
	if (hpts_num != inp->inp_input_cpu) {
		mtx_unlock(&hpts->p_mtx);
		goto again;
	}
	return (hpts);
}

static void
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
	int32_t add_freed;

	if (inp->inp_flags2 & INP_FREED) {
		/*
		 * Need to play a special trick so that in_pcbrele_wlocked
		 * does not return 1 when it really should have returned 0.
		 */
		add_freed = 1;
		inp->inp_flags2 &= ~INP_FREED;
	} else {
		add_freed = 0;
	}
#ifndef INP_REF_DEBUG
	if (in_pcbrele_wlocked(inp)) {
		/*
		 * This should not happen.  We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts.  It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#else
	if (__in_pcbrele_wlocked(inp, line)) {
		/*
		 * This should not happen.  We have the inpcb referred to by
		 * the main socket (why we are called) and the hpts.  It
		 * should always return 0.
		 */
		panic("inpcb:%p release ret 1",
		    inp);
	}
#endif
	if (add_freed) {
		inp->inp_flags2 |= INP_FREED;
	}
}

static void
tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	if (inp->inp_in_hpts) {
		hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

static void
tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
{
	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input) {
		hpts_sane_input_remove(hpts, inp, 1);
		tcp_remove_hpts_ref(inp, hpts, line);
	}
}

/*
 * Called normally with the INP_LOCKED, but it
 * does not matter; the hpts lock is the key,
 * and the lock order allows us to hold the
 * INP lock and then get the hpts lock.
 *
 * Valid values in the flags are
 * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
 * HPTS_REMOVE_INPUT - remove from the input of the hpts.
 * Note that you can OR both values together and get two
 * actions.
 */
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	if (flags & HPTS_REMOVE_OUTPUT) {
		hpts = tcp_hpts_lock(inp);
		tcp_hpts_remove_locked_output(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
	if (flags & HPTS_REMOVE_INPUT) {
		hpts = tcp_input_lock(inp);
		tcp_hpts_remove_locked_input(hpts, inp, flags, line);
		mtx_unlock(&hpts->p_mtx);
	}
}

static inline int
hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
{
	return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
}

static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
	int32_t need_wake = 0;
	uint32_t ticknow = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_hpts == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		if (hpts->p_hpts_active == 0) {
			/* A sleeping hpts we want in next slot to run */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
				    hpts_tick(hpts, 1));
			}
			inp->inp_hptsslot = hpts_tick(hpts, 1);
			inp->inp_hpts_request = 0;
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
			}
			need_wake = 1;
		} else if ((void *)inp == hpts->p_inp) {
			/*
			 * We can't allow you to go into the same slot we
			 * are in.  We must put you out.
			 */
			inp->inp_hptsslot = hpts->p_nxt_slot;
		} else
			inp->inp_hptsslot = hpts->p_cur_slot;
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		inp->inp_hpts_request = 0;
		if (logging_on) {
			tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
		}
		if (need_wake) {
			/*
			 * Activate the hpts if it is sleeping and its
			 * timeout is not 1.
			 */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
			}
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
		}
	}
	return (need_wake);
}

int
__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
{
	int32_t ret;
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

static void
tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
    struct hpts_diag *diag, int32_t noref)
{
	int32_t need_new_to = 0;
	int32_t need_wakeup = 0;
	uint32_t largest_slot;
	uint32_t ticknow = 0;
	uint32_t slot_calc;

	HPTS_MTX_ASSERT(hpts);
	if (diag) {
		memset(diag, 0, sizeof(struct hpts_diag));
		diag->p_hpts_active = hpts->p_hpts_active;
		diag->p_nxt_slot = hpts->p_nxt_slot;
		diag->p_cur_slot = hpts->p_cur_slot;
		diag->slot_req = slot;
	}
	if ((inp->inp_in_hpts == 0) || noref) {
		inp->inp_hpts_request = slot;
		if (slot == 0) {
			/* Immediate */
			tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
			return;
		}
		if (hpts->p_hpts_active) {
			/*
			 * It's slot - 1, since nxt_slot is the next tick
			 * that will go off while the hpts is awake.
			 */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
			}
			/*
			 * We want to make sure that we don't place an inp in
			 * the range of p_cur_slot <-> p_nxt_slot.  If we
			 * take from p_nxt_slot to the end, plus p_cur_slot,
			 * and then take away 2, we will know how many
			 * slots is the max we can use.
			 */
			if (hpts->p_nxt_slot > hpts->p_cur_slot) {
				/*
				 * Non-wrap case nxt_slot <-> cur_slot we
				 * don't want to land in.  So the diff gives
				 * us what is taken away from the number of
				 * slots.
				 */
				largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
			} else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
				largest_slot = NUM_OF_HPTSI_SLOTS - 2;
			} else {
				/*
				 * Wrap case so the diff gives us the number
				 * of slots that we can land in.
				 */
				largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
			}
			/*
			 * We take away two so we never have a problem (20
			 * usec's) out of 1024000 usecs
			 */
			largest_slot -= 2;
			if (inp->inp_hpts_request > largest_slot) {
				/*
				 * Restrict max jump of slots and remember
				 * leftover
				 */
				slot = largest_slot;
				inp->inp_hpts_request -= largest_slot;
			} else {
				/* This one will run when we hit it */
				inp->inp_hpts_request = 0;
			}
			if (hpts->p_nxt_slot == hpts->p_cur_slot)
				slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
			else
				slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
			if (slot_calc == hpts->p_cur_slot) {
#ifdef INVARIANTS
				/* TSNH */
				panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
				    hpts, slot_calc, slot, largest_slot);
#endif
				if (slot_calc)
					slot_calc--;
				else
					slot_calc = NUM_OF_HPTSI_SLOTS - 1;
			}
			inp->inp_hptsslot = slot_calc;
			if (diag) {
				diag->inp_hptsslot = inp->inp_hptsslot;
			}
		} else {
			/*
			 * The hpts is sleeping; we need to figure out where
			 * it will wake up at and if we need to reschedule
			 * its time-out.
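			 *
			 * As an illustrative example (numbers invented for
			 * the sketch, not taken from the real wheel size):
			 * with NUM_OF_HPTSI_SLOTS = 8, slot_now = 6 and a
			 * request of slot = 4, the inp lands in slot
			 * (6 + 4) % 8 = 2, i.e. the request simply wraps
			 * around the wheel.  Anything beyond the
			 * largest_slot cap stays behind in inp_hpts_request
			 * and gets re-inserted on a later pass.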
			 */
			uint32_t have_slept, yet_to_sleep;
			uint32_t slot_now;
			struct timeval tv;

			ticknow = tcp_gethptstick(&tv);
			slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
			/*
			 * The user wants to be inserted at (slot_now +
			 * slot) % NUM_OF_HPTSI_SLOTS, so let's set that up.
			 */
			largest_slot = NUM_OF_HPTSI_SLOTS - 2;
			if (inp->inp_hpts_request > largest_slot) {
				/* Adjust the residual in inp_hpts_request */
				slot = largest_slot;
				inp->inp_hpts_request -= largest_slot;
			} else {
				/* No residual, it all fits */
				inp->inp_hpts_request = 0;
			}
			inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
			if (diag) {
				diag->slot_now = slot_now;
				diag->inp_hptsslot = inp->inp_hptsslot;
				diag->p_on_min_sleep = hpts->p_on_min_sleep;
			}
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
			}
			/* Now do we need to restart the hpts's timer? */
			if (TSTMP_GT(ticknow, hpts->p_curtick))
				have_slept = ticknow - hpts->p_curtick;
			else
				have_slept = 0;
			if (have_slept < hpts->p_hpts_sleep_time) {
				/* This should be what happens */
				yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
			} else {
				/* We are over-due */
				yet_to_sleep = 0;
				need_wakeup = 1;
			}
			if (diag) {
				diag->have_slept = have_slept;
				diag->yet_to_sleep = yet_to_sleep;
				diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
			}
			if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
				/*
				 * We need to reschedule the hpts's time-out.
				 */
				hpts->p_hpts_sleep_time = slot;
				need_new_to = slot * HPTS_TICKS_PER_USEC;
			}
		}
		hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
		if (logging_on) {
			tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
		}
		/*
		 * Now how far is the hpts sleeping to?  If active is 1, it's
		 * up and ticking and we do nothing; otherwise we may need to
		 * reschedule its callout if need_new_to is set from above.
		 */
		if (need_wakeup) {
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
			}
			hpts->p_direct_wake = 1;
			tcp_wakehpts(hpts);
			if (diag) {
				diag->need_new_to = 0;
				diag->co_ret = 0xffff0000;
			}
		} else if (need_new_to) {
			int32_t co_ret;
			struct timeval tv;
			sbintime_t sb;

			tv.tv_sec = 0;
			tv.tv_usec = 0;
			while (need_new_to > HPTS_USEC_IN_SEC) {
				tv.tv_sec++;
				need_new_to -= HPTS_USEC_IN_SEC;
			}
			tv.tv_usec = need_new_to;
			sb = tvtosbt(tv);
			if (tcp_hpts_callout_skip_swi == 0) {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_swi, hpts, hpts->p_cpu,
				    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
			} else {
				co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
				    hpts_timeout_dir, hpts,
				    hpts->p_cpu,
				    C_PREL(tcp_hpts_precision));
			}
			if (diag) {
				diag->need_new_to = need_new_to;
				diag->co_ret = co_ret;
			}
		}
	} else {
#ifdef INVARIANTS
		panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
#endif
	}
}

uint32_t
tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
{
	struct tcp_hpts_entry *hpts;
	uint32_t slot_on, cts;
	struct timeval tv;

	/*
	 * We now return the next-slot the hpts will be on, beyond its
	 * current run (if up) or where it was when it stopped if it is
	 * sleeping.
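	 *
	 * A caller that wants visibility into the insert decision can
	 * (illustratively) pass a diag buffer and inspect it afterwards:
	 *
	 *	struct hpts_diag diag;
	 *
	 *	(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(550),
	 *	    __LINE__, &diag);
	 *	-- diag.inp_hptsslot is the slot chosen, diag.slot_req the
	 *	-- request; the remaining fields mirror the wheel state at
	 *	-- insert time.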
	 */
	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	if (in_ts_percision)
		microuptime(&tv);
	else
		getmicrouptime(&tv);
	cts = tcp_tv_to_usectick(&tv);
	tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
	slot_on = hpts->p_nxt_slot;
	mtx_unlock(&hpts->p_mtx);
	return (slot_on);
}

uint32_t
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line)
{
	return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}

int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
	int32_t retval = 0;

	HPTS_MTX_ASSERT(hpts);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		retval = 1;
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
			}
			retval = 2;
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		retval = 4;
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	return (retval);
}

void
tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos)
{
	/* Setup packet for input first */
	INP_WLOCK_ASSERT(tp->t_inpcb);
	m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
	m->m_pkthdr.pace_tlen = (uint16_t) tlen;
	m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
	m->m_pkthdr.pace_tos = iptos;
	m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0);
	if (tp->t_in_pkt == NULL) {
		tp->t_in_pkt = m;
		tp->t_tail_pkt = m;
	} else {
		tp->t_tail_pkt->m_nextpkt = m;
		tp->t_tail_pkt = m;
	}
}


int32_t
__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line)
{
	struct tcp_hpts_entry *hpts;
	int32_t ret;

	tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
	hpts = tcp_input_lock(tp->t_inpcb);
	ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
	mtx_unlock(&hpts->p_mtx);
	return (ret);
}

void
__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
{
	struct tcp_hpts_entry *hpts;
	struct tcpcb *tp;

	tp = intotcpcb(inp);
	hpts = tcp_input_lock(tp->t_inpcb);
	if (inp->inp_in_input == 0) {
		/* Ok we need to set it on the hpts in the current slot */
		hpts_sane_input_insert(hpts, inp, line);
		if (hpts->p_hpts_active == 0) {
			/*
			 * Activate the hpts if it is sleeping.
			 */
			hpts->p_direct_wake = 1;
			tcp_wakeinput(hpts);
		}
	} else if (hpts->p_hpts_active == 0) {
		hpts->p_direct_wake = 1;
		tcp_wakeinput(hpts);
	}
	inp->inp_hpts_drop_reas = reason;
	mtx_unlock(&hpts->p_mtx);
}

static uint16_t
hpts_random_cpu(struct inpcb *inp)
{
	/*
	 * No flow type set, distribute the load randomly.
	 */
	uint16_t cpuid;
	uint32_t ran;

	/*
	 * If one has been set use it, i.e. we want both in and out on the
	 * same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* Nothing set, use a random number */
	ran = arc4random();
	cpuid = (ran & 0xffff) % mp_ncpus;
	return (cpuid);
}

static uint16_t
hpts_cpuid(struct inpcb *inp)
{
	u_int cpuid;

	/*
	 * If one has been set use it, i.e. we want both in and out on the
	 * same hpts.
	 */
	if (inp->inp_input_cpu_set) {
		return (inp->inp_input_cpu);
	} else if (inp->inp_hpts_cpu_set) {
		return (inp->inp_hpts_cpu);
	}
	/* If one is set the other must be the same */
#ifdef RSS
	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
	if (cpuid == NETISR_CPUID_NONE)
		return (hpts_random_cpu(inp));
	else
		return (cpuid);
#else
	/*
	 * We don't have a flowid -> cpuid mapping, so cheat and just map
	 * unknown cpuids to curcpu.  Not the best, but apparently better
	 * than defaulting to swi 0.
	 */
	if (inp->inp_flowtype != M_HASHTYPE_NONE) {
		cpuid = inp->inp_flowid % mp_ncpus;
		return (cpuid);
	}
	cpuid = hpts_random_cpu(inp);
	return (cpuid);
#endif
}

/*
 * Do NOT try to optimize the processing of inp's
 * by first pulling off all the inp's into a temporary
 * list (e.g. TAILQ_CONCAT).  If you do that, the subtle
 * interactions of switching CPU's will kill you because of
 * problems in the linked-list manipulation.  Basically
 * you would switch cpu's with the hpts mutex locked,
 * but then while you were processing one of the inp's
 * some other one that you switched will get a new
 * packet on the different CPU.  It will insert it
 * on the new hpts's input list.  Creating a temporary
 * link in the inp will not fix it either, since
 * the other hpts will be doing the same thing and
 * you will both end up using the temporary link.
 *
 * You will die in an ASSERT for tailq corruption if you
 * run INVARIANTS, or you will die horribly without
 * INVARIANTS in some unknown way with a corrupt linked
 * list.
 */
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
	struct mbuf *m, *n;
	struct tcpcb *tp;
	struct inpcb *inp;
	uint16_t drop_reason;
	int16_t set_cpu;
	uint32_t did_prefetch = 0;
	int32_t ti_locked = TI_UNLOCKED;
	struct epoch_tracker et;

	HPTS_MTX_ASSERT(hpts);
	while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
		HPTS_MTX_ASSERT(hpts);
		hpts_sane_input_remove(hpts, inp, 0);
		if (inp->inp_input_cpu_set == 0) {
			set_cpu = 1;
		} else {
			set_cpu = 0;
		}
		hpts->p_inp = inp;
		drop_reason = inp->inp_hpts_drop_reas;
		inp->inp_in_input = 0;
		mtx_unlock(&hpts->p_mtx);
		CURVNET_SET(inp->inp_vnet);
		if (drop_reason) {
			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
			ti_locked = TI_RLOCKED;
		} else {
			ti_locked = TI_UNLOCKED;
		}
		INP_WLOCK(inp);
		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
		    (inp->inp_flags2 & INP_FREED)) {
out:
			hpts->p_inp = NULL;
			if (ti_locked == TI_RLOCKED) {
				INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
			}
			if (in_pcbrele_wlocked(inp) == 0) {
				INP_WUNLOCK(inp);
			}
			ti_locked = TI_UNLOCKED;
			CURVNET_RESTORE();
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		tp = intotcpcb(inp);
		if ((tp == NULL) || (tp->t_inpcb == NULL)) {
			goto out;
		}
		if (drop_reason) {
			/* This tcb is being destroyed for drop_reason */
			m = tp->t_in_pkt;
			if (m)
				n = m->m_nextpkt;
			else
				n = NULL;
			tp->t_in_pkt = NULL;
			while (m) {
				m_freem(m);
				m = n;
				if (m)
					n = m->m_nextpkt;
			}
			tp = tcp_drop(tp, drop_reason);
			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
			if (tp == NULL) {
				INP_WLOCK(inp);
			}
			if (in_pcbrele_wlocked(inp) == 0)
				INP_WUNLOCK(inp);
			CURVNET_RESTORE();
			mtx_lock(&hpts->p_mtx);
			continue;
		}
		if (set_cpu) {
			/*
			 * Setup so the next time we will move to the right
			 * CPU.  This should be a rare event.  It will
			 * sometimes happen when we are the client side
			 * (usually not the server).  Somehow tcp_output()
			 * gets called before the tcp_do_segment() sets the
			 * initial state.  This means the r_cpu and
			 * r_hpts_cpu is 0.  We get on the hpts, and then
			 * tcp_input() gets called setting up the r_cpu to
			 * the correct value.  The hpts goes off and sees
			 * the mis-match.
			 * We simply correct it here and the CPU will switch
			 * to the new hpts next time the tcb gets added to
			 * the hpts (not this time) :-)
			 */
			tcp_set_hpts(inp);
		}
		m = tp->t_in_pkt;
		n = NULL;
		if (m != NULL &&
		    (m->m_pkthdr.pace_lock == TI_RLOCKED ||
		    tp->t_state != TCPS_ESTABLISHED)) {
			ti_locked = TI_RLOCKED;
			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
			m = tp->t_in_pkt;
		}
		if (in_newts_every_tcb) {
			if (in_ts_percision)
				microuptime(tv);
			else
				getmicrouptime(tv);
		}
		if (tp->t_fb_ptr != NULL) {
			kern_prefetch(tp->t_fb_ptr, &did_prefetch);
			did_prefetch = 1;
		}
		/* Any input work to do, if so do it first */
		if ((m != NULL) && (m == tp->t_in_pkt)) {
			struct tcphdr *th;
			int32_t tlen, drop_hdrlen, nxt_pkt;
			uint8_t iptos;

			n = m->m_nextpkt;
			tp->t_in_pkt = tp->t_tail_pkt = NULL;
			while (m) {
				th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
				tlen = m->m_pkthdr.pace_tlen;
				drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
				iptos = m->m_pkthdr.pace_tos;
				m->m_nextpkt = NULL;
				if (n)
					nxt_pkt = 1;
				else
					nxt_pkt = 0;
				inp->inp_input_calls = 1;
				if (tp->t_fb->tfb_tcp_hpts_do_segment) {
					/* Use the hpts specific do_segment */
					(*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
					    tp, drop_hdrlen,
					    tlen, iptos, nxt_pkt, tv);
				} else {
					/* Use the default do_segment */
					(*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
					    tp, drop_hdrlen,
					    tlen, iptos);
				}
				if (ti_locked == TI_RLOCKED)
					INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
				/*
				 * Do segment returns unlocked; we need the
				 * lock again but we also need some kasserts
				 * here.
				 */
				INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
				INP_UNLOCK_ASSERT(inp);
				m = n;
				if (m)
					n = m->m_nextpkt;
				if (m != NULL &&
				    m->m_pkthdr.pace_lock == TI_RLOCKED) {
					INP_INFO_RLOCK_ET(&V_tcbinfo, et);
					ti_locked = TI_RLOCKED;
				} else
					ti_locked = TI_UNLOCKED;
				INP_WLOCK(inp);
				/*
				 * Since we have an opening here we must
				 * re-check if the tcb went away while we
				 * were getting the lock(s).
				 */
				if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
				    (inp->inp_flags2 & INP_FREED)) {
					while (m) {
						m_freem(m);
						m = n;
						if (m)
							n = m->m_nextpkt;
					}
					goto out;
				}
				/*
				 * Now that we hold the INP lock, check if
				 * we need to upgrade our lock.
				 */
				if (ti_locked == TI_UNLOCKED &&
				    (tp->t_state != TCPS_ESTABLISHED)) {
					ti_locked = TI_RLOCKED;
					INP_INFO_RLOCK_ET(&V_tcbinfo, et);
				}
			}	/** end while(m) */
		}		/** end if ((m != NULL) && (m == tp->t_in_pkt)) */
		if (in_pcbrele_wlocked(inp) == 0)
			INP_WUNLOCK(inp);
		if (ti_locked == TI_RLOCKED)
			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
		INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
		INP_UNLOCK_ASSERT(inp);
		ti_locked = TI_UNLOCKED;
		mtx_lock(&hpts->p_mtx);
		hpts->p_inp = NULL;
		CURVNET_RESTORE();
	}
}

static int
tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
{
	int32_t ticks_to_run;

	if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
		ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
		if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
			ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
		}
	} else {
		if (hpts->p_prevtick == hpts->p_curtick) {
			/* This happens when we get woken up right away */
			return (-1);
		}
		ticks_to_run = 1;
	}
	/* Set in where we will be when we catch up */
	hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
	if (hpts->p_nxt_slot == hpts->p_cur_slot) {
		panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
		    hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
	}
	return (ticks_to_run);
}

static void
tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
{
	struct tcpcb *tp;
	struct inpcb *inp = NULL, *ninp;
	struct timeval tv;
	int32_t ticks_to_run, i, error, tick_now, interum_tick;
	int32_t paced_cnt = 0;
	int32_t did_prefetch = 0;
	int32_t prefetch_ninp = 0;
	int32_t prefetch_tp = 0;
	uint32_t cts;
	int16_t set_cpu;

	HPTS_MTX_ASSERT(hpts);
	hpts->p_curtick = tcp_tv_to_hptstick(ctick);
	cts = tcp_tv_to_usectick(ctick);
	memcpy(&tv, ctick, sizeof(struct timeval));
	hpts->p_cur_slot = hpts_tick(hpts, 1);

	/* Figure out if we had missed ticks */
again:
	HPTS_MTX_ASSERT(hpts);
	ticks_to_run = tcp_hpts_est_run(hpts);
	if (!TAILQ_EMPTY(&hpts->p_input)) {
		tcp_input_data(hpts, &tv);
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("tp:%p in_hpts input empty but cnt:%d",
		    hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	HPTS_MTX_ASSERT(hpts);
	/* Reset the ticks to run and the time if we need to */
	interum_tick = tcp_gethptstick(&tv);
	if (interum_tick != hpts->p_curtick) {
		/* Save off the new time we execute to */
		*ctick = tv;
		hpts->p_curtick = interum_tick;
		cts = tcp_tv_to_usectick(&tv);
		hpts->p_cur_slot = hpts_tick(hpts, 1);
		ticks_to_run = tcp_hpts_est_run(hpts);
	}
	if (ticks_to_run == -1) {
		goto no_run;
	}
	if (logging_on) {
		tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
	}
	if (hpts->p_on_queue_cnt == 0) {
		goto no_one;
	}
	HPTS_MTX_ASSERT(hpts);
	for (i = 0; i < ticks_to_run; i++) {
		/*
		 * Calculate our delay; if there are no extra ticks there
		 * was not any.
		 */
		hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
		HPTS_MTX_ASSERT(hpts);
		while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
			/* For debugging */
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
			}
			hpts->p_inp = inp;
			paced_cnt++;
			if (hpts->p_cur_slot != inp->inp_hptsslot) {
				panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
				    hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
			}
			/* Now pull it */
			if (inp->inp_hpts_cpu_set == 0) {
				set_cpu = 1;
			} else {
				set_cpu = 0;
			}
			hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
			if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
				/* We prefetch the next inp if possible */
				kern_prefetch(ninp, &prefetch_ninp);
				prefetch_ninp = 1;
			}
			if (inp->inp_hpts_request) {
				/*
				 * This guy is deferred out further in time
				 * than our wheel had on it.  Push him back
				 * on the wheel.
				 */
				int32_t remaining_slots;

				remaining_slots = ticks_to_run - (i + 1);
				if (inp->inp_hpts_request > remaining_slots) {
					/*
					 * Keep INVARIANTS happy by clearing
					 * the flag
					 */
					tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
					hpts->p_inp = NULL;
					continue;
				}
				inp->inp_hpts_request = 0;
			}
			/*
			 * We clear the hpts flag here after dealing with
			 * remaining slots.  This way anyone looking with the
			 * TCB lock will see it's on the hpts until just
			 * before we unlock.
			 */
			inp->inp_in_hpts = 0;
			mtx_unlock(&hpts->p_mtx);
			INP_WLOCK(inp);
			if (in_pcbrele_wlocked(inp)) {
				mtx_lock(&hpts->p_mtx);
				if (logging_on)
					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
				hpts->p_inp = NULL;
				continue;
			}
			if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
out_now:
#ifdef INVARIANTS
				if (mtx_owned(&hpts->p_mtx)) {
					panic("Hpts:%p owns mtx prior-to lock line:%d",
					    hpts, __LINE__);
				}
#endif
				INP_WUNLOCK(inp);
				mtx_lock(&hpts->p_mtx);
				if (logging_on)
					tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
				hpts->p_inp = NULL;
				continue;
			}
			tp = intotcpcb(inp);
			if ((tp == NULL) || (tp->t_inpcb == NULL)) {
				goto out_now;
			}
			if (set_cpu) {
				/*
				 * Setup so the next time we will move to
				 * the right CPU.  This should be a rare
				 * event.  It will sometimes happen when we
				 * are the client side (usually not the
				 * server).  Somehow tcp_output() gets called
				 * before the tcp_do_segment() sets the
				 * initial state.  This means the r_cpu and
				 * r_hpts_cpu is 0.  We get on the hpts, and
				 * then tcp_input() gets called setting up
				 * the r_cpu to the correct value.  The hpts
				 * goes off and sees the mis-match.  We
				 * simply correct it here and the CPU will
				 * switch to the new hpts next time the tcb
				 * gets added to the hpts (not this one)
				 * :-)
				 */
				tcp_set_hpts(inp);
			}
			if (out_newts_every_tcb) {
				struct timeval sv;

				if (out_ts_percision)
					microuptime(&sv);
				else
					getmicrouptime(&sv);
				cts = tcp_tv_to_usectick(&sv);
			}
			CURVNET_SET(inp->inp_vnet);
			/*
			 * There is a hole here; we get the refcnt on the
			 * inp so it will still be preserved, but to make
			 * sure we can get the INP we need to hold the p_mtx
			 * above while we pull out the tp/inp.  As long as
			 * fini gets the lock first we are assured of having
			 * a sane INP we can lock and test.
			 */
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx before tcp-output:%d",
				    hpts, __LINE__);
			}
#endif
			if (tp->t_fb_ptr != NULL) {
				kern_prefetch(tp->t_fb_ptr, &did_prefetch);
				did_prefetch = 1;
			}
			inp->inp_hpts_calls = 1;
			if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
				error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
			} else {
				error = tp->t_fb->tfb_tcp_output(tp);
			}
			if (ninp && ninp->inp_ppcb) {
				/*
				 * If we have a nxt inp, see if we can
				 * prefetch its ppcb.  Note this may seem
				 * "risky" since we have no locks (other
				 * than the previous inp) and there is no
				 * assurance that ninp was not pulled while
				 * we were processing inp and freed.  If this
				 * occurred it could mean that either:
				 *
				 * a) It's NULL (which is fine, we won't go
				 * here) <or> b) It's valid (which is cool,
				 * we will prefetch it) <or> c) The inp got
				 * freed back to the slab which was
				 * reallocated.  Then the piece of memory was
				 * re-used and something else (not an
				 * address) is in inp_ppcb.  If that occurs
				 * we don't crash, but take a TLB shootdown
				 * performance hit (same as if it was NULL
				 * and we tried to pre-fetch it).
				 *
				 * Considering that the likelihood of <c> is
				 * quite rare we will take a risk on doing
				 * this.  If performance drops after testing
				 * we can always take this out.  NB: the
				 * kern_prefetch on amd64 actually has
				 * protection against a bad address now via
				 * the DMAP_() tests.  This will prevent the
				 * TLB hit, and instead if <c> occurs just
				 * cause us to load cache with a useless
				 * address (to us).
				 */
				kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
				prefetch_tp = 1;
			}
			INP_WUNLOCK(inp);
			INP_UNLOCK_ASSERT(inp);
			CURVNET_RESTORE();
#ifdef INVARIANTS
			if (mtx_owned(&hpts->p_mtx)) {
				panic("Hpts:%p owns mtx prior-to lock line:%d",
				    hpts, __LINE__);
			}
#endif
			mtx_lock(&hpts->p_mtx);
			if (logging_on)
				tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
			hpts->p_inp = NULL;
		}
		HPTS_MTX_ASSERT(hpts);
		hpts->p_inp = NULL;
		hpts->p_cur_slot++;
		if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
			hpts->p_cur_slot = 0;
		}
	}
no_one:
	HPTS_MTX_ASSERT(hpts);
	hpts->p_prevtick = hpts->p_curtick;
	hpts->p_delayed_by = 0;
	/*
	 * Check to see if we took an excess amount of time and need to run
	 * more ticks (if we did not hit eno-bufs).
	 */
	/* Re-run any input that may be there */
	(void)tcp_gethptstick(&tv);
	if (!TAILQ_EMPTY(&hpts->p_input)) {
		tcp_input_data(hpts, &tv);
	}
#ifdef INVARIANTS
	if (TAILQ_EMPTY(&hpts->p_input) &&
	    (hpts->p_on_inqueue_cnt != 0)) {
		panic("tp:%p in_hpts input empty but cnt:%d",
		    hpts, hpts->p_on_inqueue_cnt);
	}
#endif
	tick_now = tcp_gethptstick(&tv);
	if (SEQ_GT(tick_now, hpts->p_prevtick)) {
		struct timeval res;

		/* Did we really spend a full tick or more in here? */
		timersub(&tv, ctick, &res);
		if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
			counter_u64_add(hpts_loops, 1);
			if (logging_on) {
				tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
			}
			*ctick = res;
			hpts->p_curtick = tick_now;
			goto again;
		}
	}
no_run:
	{
		uint32_t t = 0, i, fnd = 0;

		if (hpts->p_on_queue_cnt) {
			/*
			 * Find the next slot that is occupied and use that
			 * to be the sleep time.
			 */
			for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
				if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
					fnd = 1;
					break;
				}
				t = (t + 1) % NUM_OF_HPTSI_SLOTS;
			}
			if (fnd) {
				hpts->p_hpts_sleep_time = i;
			} else {
				counter_u64_add(back_tosleep, 1);
#ifdef INVARIANTS
				panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
				hpts->p_on_queue_cnt = 0;
				goto non_found;
			}
			t++;
		} else {
			/* No one on the wheel, sleep for all but 2 slots */
non_found:
			if (hpts_sleep_max == 0)
				hpts_sleep_max = 1;
			hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
			t = 0;
		}
		if (logging_on) {
			tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
		}
	}
}

void
__tcp_set_hpts(struct inpcb *inp, int32_t line)
{
	struct tcp_hpts_entry *hpts;

	INP_WLOCK_ASSERT(inp);
	hpts = tcp_hpts_lock(inp);
	if ((inp->inp_in_hpts == 0) &&
	    (inp->inp_hpts_cpu_set == 0)) {
		inp->inp_hpts_cpu = hpts_cpuid(inp);
		inp->inp_hpts_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
	hpts = tcp_input_lock(inp);
	if ((inp->inp_input_cpu_set == 0) &&
	    (inp->inp_in_input == 0)) {
		inp->inp_input_cpu = hpts_cpuid(inp);
		inp->inp_input_cpu_set = 1;
	}
	mtx_unlock(&hpts->p_mtx);
}

uint16_t
tcp_hpts_delayedby(struct inpcb *inp)
{
	return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
}

static void
tcp_hpts_thread(void *ctx)
{
	struct tcp_hpts_entry *hpts;
	struct timeval tv;
	sbintime_t sb;

	hpts = (struct tcp_hpts_entry *)ctx;
	mtx_lock(&hpts->p_mtx);
	if (hpts->p_direct_wake) {
		/* Signaled by input */
		if (logging_on)
			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
		callout_stop(&hpts->co);
	} else {
		/* Timed out */
		if (callout_pending(&hpts->co) ||
		    !callout_active(&hpts->co)) {
			if (logging_on)
				tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
			mtx_unlock(&hpts->p_mtx);
			return;
		}
		callout_deactivate(&hpts->co);
		if (logging_on)
			tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
	}
	hpts->p_hpts_active = 1;
	(void)tcp_gethptstick(&tv);
	tcp_hptsi(hpts, &tv);
	HPTS_MTX_ASSERT(hpts);
	tv.tv_sec = 0;
	tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
	if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
		tv.tv_usec = tcp_min_hptsi_time;
		hpts->p_on_min_sleep = 1;
	} else {
		/* Clear the min sleep flag */
		hpts->p_on_min_sleep = 0;
	}
	hpts->p_hpts_active = 0;
	sb = tvtosbt(tv);
	if (tcp_hpts_callout_skip_swi == 0) {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_swi, hpts, hpts->p_cpu,
		    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
	} else {
		callout_reset_sbt_on(&hpts->co, sb, 0,
		    hpts_timeout_dir, hpts,
		    hpts->p_cpu,
		    C_PREL(tcp_hpts_precision));
	}
	hpts->p_direct_wake = 0;
	mtx_unlock(&hpts->p_mtx);
}

#undef timersub

static void
tcp_init_hptsi(void *st)
{
	int32_t i, j, error, bound = 0, created = 0;
	size_t sz, asz;
	struct timeval tv;
	sbintime_t sb;
	struct tcp_hpts_entry *hpts;
	char unit[16];
	uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;

	tcp_pace.rp_proc = NULL;
	tcp_pace.rp_num_hptss = ncpus;
	hpts_loops = counter_u64_alloc(M_WAITOK);
	back_tosleep = counter_u64_alloc(M_WAITOK);

	sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
	tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
	asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
	for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
		tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
		    M_TCPHPTS, M_WAITOK | M_ZERO);
		tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
		    M_TCPHPTS, M_WAITOK);
		hpts = tcp_pace.rp_ent[i];
		/*
		 * Init all the hpts structures that are not specifically
		 * zero'd by the allocations.  Also lets attach them to the
		 * appropriate sysctl block as well.
		 */
		mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
		    "hpts", MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&hpts->p_input);
		for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
			TAILQ_INIT(&hpts->p_hptss[j]);
		}
		sysctl_ctx_init(&hpts->hpts_ctx);
		sprintf(unit, "%d", i);
		hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
		    OID_AUTO,
		    unit,
		    CTLFLAG_RW, 0,
		    "");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "in_qcnt", CTLFLAG_RD,
		    &hpts->p_on_inqueue_cnt, 0,
		    "Count TCB's awaiting input processing");
		SYSCTL_ADD_INT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "out_qcnt", CTLFLAG_RD,
		    &hpts->p_on_queue_cnt, 0,
		    "Count TCB's awaiting output processing");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "active", CTLFLAG_RD,
		    &hpts->p_hpts_active, 0,
		    "Is the hpts active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curslot", CTLFLAG_RD,
		    &hpts->p_cur_slot, 0,
		    "What the current slot is if active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "curtick", CTLFLAG_RD,
		    &hpts->p_curtick, 0,
		    "What the current tick is if active");
		SYSCTL_ADD_UINT(&hpts->hpts_ctx,
		    SYSCTL_CHILDREN(hpts->hpts_root),
		    OID_AUTO, "logsize", CTLFLAG_RD,
		    &hpts->p_logsize, 0,
		    "Hpts logging buffer size");
		hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
		hpts->p_num = i;
		hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
		hpts->p_prevtick -= 1;
		hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
		hpts->p_cpu = 0xffff;
		hpts->p_nxt_slot = 1;
		hpts->p_logsize = tcp_hpts_logging_size;
		if (hpts->p_logsize) {
			sz = (sizeof(struct hpts_log) * hpts->p_logsize);
			hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
		}
		callout_init(&hpts->co, 1);
	}
	/*
	 * Now lets start ithreads to handle the hptss.
	 */
	CPU_FOREACH(i) {
		hpts = tcp_pace.rp_ent[i];
		hpts->p_cpu = i;
		error = swi_add(&hpts->ie, "hpts",
		    tcp_hpts_thread, (void *)hpts,
		    SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
		if (error) {
			panic("Can't add hpts:%p i:%d err:%d",
			    hpts, i, error);
		}
		created++;
		if (tcp_bind_threads) {
			if (intr_event_bind(hpts->ie, i) == 0)
				bound++;
		}
		tv.tv_sec = 0;
		tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
		sb = tvtosbt(tv);
		if (tcp_hpts_callout_skip_swi == 0) {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_swi, hpts, hpts->p_cpu,
			    (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
		} else {
			callout_reset_sbt_on(&hpts->co, sb, 0,
			    hpts_timeout_dir, hpts,
			    hpts->p_cpu,
			    C_PREL(tcp_hpts_precision));
		}
	}
	printf("TCP Hpts created %d swi interrupt threads and bound %d\n",
	    created, bound);
	return;
}

SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);