/*-
 * Copyright (c) 2025 Netflix, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef __tcp_hpts_internal_h__
#define __tcp_hpts_internal_h__

/*
 * TCP High Precision Timer System (HPTS) - Internal Definitions
 *
 * This header contains internal structures, constants, and interfaces that are
 * implemented in tcp_hpts.c but exposed to enable comprehensive unit testing of
 * the HPTS subsystem.
 */

#if defined(_KERNEL)

/*
 * The hpts uses a wheel of 102400 slots.  The wheel defines the time in
 * 10 usec increments (102400 x 10).  This gives a range of 10 usec - 1024 ms
 * in which to place an entry.  If the user requests more than 1.024 seconds,
 * a remainder is attached and the hpts, when seeing the remainder, will
 * re-insert the inpcb forward in time from where it is until the remainder
 * is zero.
 */
#define	NUM_OF_HPTSI_SLOTS		102400

/*
 * Convert microseconds to HPTS slots (10 usec each), rounding up to the next
 * whole slot for non-negative integer arguments.  The parameter is
 * parenthesized so that expression arguments such as "a << 1" or
 * "c ? a : b" expand with the intended precedence.
 */
#define	HPTS_USEC_TO_SLOTS(x)		(((x) + 9) / 10)

/* The number of connections after which the dynamic sleep logic kicks in. */
#define	DEFAULT_CONNECTION_THRESHOLD	100

extern int tcp_bind_threads;	/* Thread binding configuration
				 * (0=none, 1=cpu, 2=numa) */

/*
 * Abstraction layer controlling time, interrupts and callouts.
 *
 * A table of this type is attached to each struct tcp_hptsi through its
 * "funcs" member so that unit tests can substitute their own
 * implementations.  Each member carries the name and signature of a kernel
 * routine; the default table presumably forwards to the routine of the same
 * name (confirm against tcp_hptsi_default_funcs in tcp_hpts.c).
 */
struct tcp_hptsi_funcs {
	/* Time source. */
	void	(*microuptime)(struct timeval *tv);

	/* Software interrupt management. */
	int	(*swi_add)(struct intr_event **eventp, const char *name,
		    driver_intr_t handler, void *arg, int pri,
		    enum intr_type flags, void **cookiep);
	int	(*swi_remove)(void *cookie);
	void	(*swi_sched)(void *cookie, int flags);

	/* Interrupt event CPU binding. */
	int	(*intr_event_bind)(struct intr_event *ie, int cpu);
	int	(*intr_event_bind_ithread_cpuset)(struct intr_event *ie,
		    struct _cpuset *mask);

	/* Callout management. */
	void	(*callout_init)(struct callout *c, int mpsafe);
	int	(*callout_reset_sbt_on)(struct callout *c, sbintime_t sbt,
		    sbintime_t precision, void (*func)(void *), void *arg,
		    int cpu, int flags);
	int	(*_callout_stop_safe)(struct callout *c, int flags);
};

/* Default function table for system operation */
extern const struct tcp_hptsi_funcs tcp_hptsi_default_funcs;

/* Each hpts has its own p_mtx which is used for locking */
#define	HPTS_MTX_ASSERT(hpts)	mtx_assert(&(hpts)->p_mtx, MA_OWNED)
#define	HPTS_LOCK(hpts)		mtx_lock(&(hpts)->p_mtx)
#define	HPTS_TRYLOCK(hpts)	mtx_trylock(&(hpts)->p_mtx)
#define	HPTS_UNLOCK(hpts)	mtx_unlock(&(hpts)->p_mtx)

/*
 * Per-hpts state: one timer wheel together with its lock, bookkeeping,
 * sysctl nodes, software interrupt handle and callout.  The "Cache line"
 * markers below are the author's intended layout; they are not enforced
 * here apart from the explicit __aligned() annotations.
 */
struct tcp_hpts_entry {
	/* Cache line 0x00 */
	struct mtx p_mtx;	/* Mutex for hpts */
	struct timeval p_mysleep;	/* Our min sleep time */
	uint64_t syscall_cnt;
	uint64_t sleeping;	/* What the actual sleep was (if sleeping) */
	uint16_t p_hpts_active;	/* Flag that says hpts is awake */
	uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
	uint32_t p_runningslot;	/* Current slot we are at if we are running */
	uint32_t p_prev_slot;	/* Previous slot we were on */
	uint32_t p_cur_slot;	/* Current slot in wheel hpts is draining */
	uint32_t p_nxt_slot;	/* The next slot outside the current range
				 * of slots that the hpts is running on. */
	int32_t p_on_queue_cnt;	/* Count on queue in this hpts */
	uint8_t p_direct_wake :1,	/* boolean */
		p_on_min_sleep:1,	/* boolean */
		p_hpts_wake_scheduled:1,/* boolean */
		hit_callout_thresh:1,
		p_avail:4;
	uint8_t p_fill[3];	/* Fill to 32 bits */
	/* Cache line 0x40 */
	struct hptsh {
		TAILQ_HEAD(, tcpcb) head;
		uint32_t count;
		uint32_t gencnt;
	} *p_hptss;		/* Hptsi wheel */
	uint32_t p_hpts_sleep_time;	/* Current sleep interval having a max
					 * of 255ms */
	uint32_t overidden_sleep;	/* what was overridden by min-sleep,
					 * for logging */
	uint32_t saved_curslot;		/* for logging */
	uint32_t saved_prev_slot;	/* for logging */
	uint32_t p_delayed_by;	/* How much were we delayed by */
	/* Cache line 0x80 */
	struct sysctl_ctx_list hpts_ctx;
	struct sysctl_oid *hpts_root;
	struct intr_event *ie;
	void *ie_cookie;
	uint16_t p_cpu;		/* The hpts CPU */
	struct tcp_hptsi *p_hptsi;	/* Back pointer to parent hptsi
					 * structure */
	/* There is extra space in here */
	/* Cache line 0x100 */
	struct callout co __aligned(CACHE_LINE_SIZE);
} __aligned(CACHE_LINE_SIZE);

/*
 * Top-level HPTS instance: the array of per-hpts entries plus the CPU
 * group / NUMA domain bookkeeping and the function table used to reach
 * the kernel services abstracted by struct tcp_hptsi_funcs.
 */
struct tcp_hptsi {
	struct cpu_group **grps;
	struct tcp_hpts_entry **rp_ent;	/* Array of hptss */
	uint32_t *cts_last_ran;
	uint32_t grp_cnt;
	uint32_t rp_num_hptss;	/* Number of hpts threads */
	struct hpts_domain_info {
		int count;
		int cpu[MAXCPU];
	} domains[MAXMEMDOM];	/* Per-NUMA domain CPU assignments */
	const struct tcp_hptsi_funcs *funcs;	/* Function table for
						 * testability */
};

/*
 * Core tcp_hptsi structure manipulation functions.
 * These are implemented in tcp_hpts.c; see there for exact semantics.
 */
struct tcp_hptsi *tcp_hptsi_create(const struct tcp_hptsi_funcs *funcs,
    bool enable_sysctl);
void tcp_hptsi_destroy(struct tcp_hptsi *pace);
void tcp_hptsi_start(struct tcp_hptsi *pace);
void tcp_hptsi_stop(struct tcp_hptsi *pace);
uint16_t tcp_hptsi_random_cpu(struct tcp_hptsi *pace);
int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, bool from_callout);

void tcp_hpts_wake(struct tcp_hpts_entry *hpts);

/*
 * LRO HPTS initialization and uninitialization, only for internal use by the
 * HPTS code.
 */
void tcp_lro_hpts_init(void);
void tcp_lro_hpts_uninit(void);

#endif /* defined(_KERNEL) */
#endif /* __tcp_hpts_internal_h__ */