1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "io_uring.h" 4 #include "napi.h" 5 6 #ifdef CONFIG_NET_RX_BUSY_POLL 7 8 /* Timeout for cleanout of stale entries. */ 9 #define NAPI_TIMEOUT (60 * SEC_CONVERSION) 10 11 struct io_napi_entry { 12 unsigned int napi_id; 13 struct list_head list; 14 15 unsigned long timeout; 16 struct hlist_node node; 17 18 struct rcu_head rcu; 19 }; 20 21 static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, 22 unsigned int napi_id) 23 { 24 struct io_napi_entry *e; 25 26 hlist_for_each_entry_rcu(e, hash_list, node) { 27 if (e->napi_id != napi_id) 28 continue; 29 return e; 30 } 31 32 return NULL; 33 } 34 35 static inline ktime_t net_to_ktime(unsigned long t) 36 { 37 /* napi approximating usecs, reverse busy_loop_current_time */ 38 return ns_to_ktime(t << 10); 39 } 40 41 int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id) 42 { 43 struct hlist_head *hash_list; 44 struct io_napi_entry *e; 45 46 /* Non-NAPI IDs can be rejected. */ 47 if (!napi_id_valid(napi_id)) 48 return -EINVAL; 49 50 hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; 51 52 scoped_guard(rcu) { 53 e = io_napi_hash_find(hash_list, napi_id); 54 if (e) { 55 WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); 56 return -EEXIST; 57 } 58 } 59 60 e = kmalloc(sizeof(*e), GFP_NOWAIT); 61 if (!e) 62 return -ENOMEM; 63 64 e->napi_id = napi_id; 65 e->timeout = jiffies + NAPI_TIMEOUT; 66 67 /* 68 * guard(spinlock) is not used to manually unlock it before calling 69 * kfree() 70 */ 71 spin_lock(&ctx->napi_lock); 72 if (unlikely(io_napi_hash_find(hash_list, napi_id))) { 73 spin_unlock(&ctx->napi_lock); 74 kfree(e); 75 return -EEXIST; 76 } 77 78 hlist_add_tail_rcu(&e->node, hash_list); 79 list_add_tail_rcu(&e->list, &ctx->napi_list); 80 spin_unlock(&ctx->napi_lock); 81 return 0; 82 } 83 84 static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) 85 { 86 struct hlist_head *hash_list; 87 struct io_napi_entry *e; 88 89 /* Non-NAPI IDs can be rejected. */ 90 if (!napi_id_valid(napi_id)) 91 return -EINVAL; 92 93 hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; 94 guard(spinlock)(&ctx->napi_lock); 95 e = io_napi_hash_find(hash_list, napi_id); 96 if (!e) 97 return -ENOENT; 98 99 list_del_rcu(&e->list); 100 hash_del_rcu(&e->node); 101 kfree_rcu(e, rcu); 102 return 0; 103 } 104 105 static void __io_napi_remove_stale(struct io_ring_ctx *ctx) 106 { 107 struct io_napi_entry *e; 108 109 guard(spinlock)(&ctx->napi_lock); 110 /* 111 * list_for_each_entry_safe() is not required as long as: 112 * 1. list_del_rcu() does not reset the deleted node next pointer 113 * 2. kfree_rcu() delays the memory freeing until the next quiescent 114 * state 115 */ 116 list_for_each_entry(e, &ctx->napi_list, list) { 117 if (time_after(jiffies, READ_ONCE(e->timeout))) { 118 list_del_rcu(&e->list); 119 hash_del_rcu(&e->node); 120 kfree_rcu(e, rcu); 121 } 122 } 123 } 124 125 static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) 126 { 127 if (is_stale) 128 __io_napi_remove_stale(ctx); 129 } 130 131 static inline bool io_napi_busy_loop_timeout(ktime_t start_time, 132 ktime_t bp) 133 { 134 if (bp) { 135 ktime_t end_time = ktime_add(start_time, bp); 136 ktime_t now = net_to_ktime(busy_loop_current_time()); 137 138 return ktime_after(now, end_time); 139 } 140 141 return true; 142 } 143 144 static bool io_napi_busy_loop_should_end(void *data, 145 unsigned long start_time) 146 { 147 struct io_wait_queue *iowq = data; 148 149 if (signal_pending(current)) 150 return true; 151 if (io_should_wake(iowq) || io_has_work(iowq->ctx)) 152 return true; 153 if (io_napi_busy_loop_timeout(net_to_ktime(start_time), 154 iowq->napi_busy_poll_dt)) 155 return true; 156 157 return false; 158 } 159 160 /* 161 * never report stale entries 162 */ 163 static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, 164 bool (*loop_end)(void *, unsigned long), 165 void *loop_end_arg) 166 { 167 struct io_napi_entry *e; 168 169 list_for_each_entry_rcu(e, &ctx->napi_list, list) 170 napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, 171 ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); 172 return false; 173 } 174 175 static bool 176 dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, 177 bool (*loop_end)(void *, unsigned long), 178 void *loop_end_arg) 179 { 180 struct io_napi_entry *e; 181 bool is_stale = false; 182 183 list_for_each_entry_rcu(e, &ctx->napi_list, list) { 184 napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, 185 ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); 186 187 if (time_after(jiffies, READ_ONCE(e->timeout))) 188 is_stale = true; 189 } 190 191 return is_stale; 192 } 193 194 static inline bool 195 __io_napi_do_busy_loop(struct io_ring_ctx *ctx, 196 bool (*loop_end)(void *, unsigned long), 197 void *loop_end_arg) 198 { 199 if (READ_ONCE(ctx->napi_track_mode) == IO_URING_NAPI_TRACKING_STATIC) 200 return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); 201 return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); 202 } 203 204 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, 205 struct io_wait_queue *iowq) 206 { 207 unsigned long start_time = busy_loop_current_time(); 208 bool (*loop_end)(void *, unsigned long) = NULL; 209 void *loop_end_arg = NULL; 210 bool is_stale = false; 211 212 /* Singular lists use a different napi loop end check function and are 213 * only executed once. 214 */ 215 if (list_is_singular(&ctx->napi_list)) { 216 loop_end = io_napi_busy_loop_should_end; 217 loop_end_arg = iowq; 218 } 219 220 scoped_guard(rcu) { 221 do { 222 is_stale = __io_napi_do_busy_loop(ctx, loop_end, 223 loop_end_arg); 224 } while (!io_napi_busy_loop_should_end(iowq, start_time) && 225 !loop_end_arg); 226 } 227 228 io_napi_remove_stale(ctx, is_stale); 229 } 230 231 /* 232 * io_napi_init() - Init napi settings 233 * @ctx: pointer to io-uring context structure 234 * 235 * Init napi settings in the io-uring context. 236 */ 237 void io_napi_init(struct io_ring_ctx *ctx) 238 { 239 u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; 240 241 INIT_LIST_HEAD(&ctx->napi_list); 242 spin_lock_init(&ctx->napi_lock); 243 ctx->napi_prefer_busy_poll = false; 244 ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); 245 ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; 246 } 247 248 /* 249 * io_napi_free() - Deallocate napi 250 * @ctx: pointer to io-uring context structure 251 * 252 * Free the napi list and the hash table in the io-uring context. 253 */ 254 void io_napi_free(struct io_ring_ctx *ctx) 255 { 256 struct io_napi_entry *e; 257 258 guard(spinlock)(&ctx->napi_lock); 259 list_for_each_entry(e, &ctx->napi_list, list) { 260 hash_del_rcu(&e->node); 261 kfree_rcu(e, rcu); 262 } 263 INIT_LIST_HEAD_RCU(&ctx->napi_list); 264 } 265 266 static int io_napi_register_napi(struct io_ring_ctx *ctx, 267 struct io_uring_napi *napi) 268 { 269 switch (napi->op_param) { 270 case IO_URING_NAPI_TRACKING_DYNAMIC: 271 case IO_URING_NAPI_TRACKING_STATIC: 272 break; 273 default: 274 return -EINVAL; 275 } 276 /* clean the napi list for new settings */ 277 io_napi_free(ctx); 278 WRITE_ONCE(ctx->napi_track_mode, napi->op_param); 279 /* cap NAPI at 10 msec of spin time */ 280 napi->busy_poll_to = min(10000, napi->busy_poll_to); 281 WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); 282 WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); 283 return 0; 284 } 285 286 /* 287 * io_napi_register() - Register napi with io-uring 288 * @ctx: pointer to io-uring context structure 289 * @arg: pointer to io_uring_napi structure 290 * 291 * Register napi in the io-uring context. 292 */ 293 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) 294 { 295 const struct io_uring_napi curr = { 296 .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), 297 .prefer_busy_poll = ctx->napi_prefer_busy_poll, 298 .op_param = ctx->napi_track_mode 299 }; 300 struct io_uring_napi napi; 301 302 if (ctx->flags & IORING_SETUP_IOPOLL) 303 return -EINVAL; 304 if (copy_from_user(&napi, arg, sizeof(napi))) 305 return -EFAULT; 306 if (napi.pad[0] || napi.pad[1] || napi.resv) 307 return -EINVAL; 308 309 if (copy_to_user(arg, &curr, sizeof(curr))) 310 return -EFAULT; 311 312 switch (napi.opcode) { 313 case IO_URING_NAPI_REGISTER_OP: 314 return io_napi_register_napi(ctx, &napi); 315 case IO_URING_NAPI_STATIC_ADD_ID: 316 if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) 317 return -EINVAL; 318 return __io_napi_add_id(ctx, napi.op_param); 319 case IO_URING_NAPI_STATIC_DEL_ID: 320 if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) 321 return -EINVAL; 322 return __io_napi_del_id(ctx, napi.op_param); 323 default: 324 return -EINVAL; 325 } 326 } 327 328 /* 329 * io_napi_unregister() - Unregister napi with io-uring 330 * @ctx: pointer to io-uring context structure 331 * @arg: pointer to io_uring_napi structure 332 * 333 * Unregister napi. If arg has been specified copy the busy poll timeout and 334 * prefer busy poll setting to the passed in structure. 335 */ 336 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) 337 { 338 const struct io_uring_napi curr = { 339 .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), 340 .prefer_busy_poll = ctx->napi_prefer_busy_poll 341 }; 342 343 if (arg && copy_to_user(arg, &curr, sizeof(curr))) 344 return -EFAULT; 345 346 WRITE_ONCE(ctx->napi_busy_poll_dt, 0); 347 WRITE_ONCE(ctx->napi_prefer_busy_poll, false); 348 WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); 349 return 0; 350 } 351 352 /* 353 * __io_napi_busy_loop() - execute busy poll loop 354 * @ctx: pointer to io-uring context structure 355 * @iowq: pointer to io wait queue 356 * 357 * Execute the busy poll loop and merge the spliced off list. 358 */ 359 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) 360 { 361 if (ctx->flags & IORING_SETUP_SQPOLL) 362 return; 363 364 iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); 365 if (iowq->timeout != KTIME_MAX) { 366 ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); 367 368 iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); 369 } 370 371 iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); 372 io_napi_blocking_busy_loop(ctx, iowq); 373 } 374 375 /* 376 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll 377 * @ctx: pointer to io-uring context structure 378 * 379 * Splice of the napi list and execute the napi busy poll loop. 380 */ 381 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) 382 { 383 bool is_stale = false; 384 385 if (!READ_ONCE(ctx->napi_busy_poll_dt)) 386 return 0; 387 if (list_empty_careful(&ctx->napi_list)) 388 return 0; 389 390 scoped_guard(rcu) { 391 is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); 392 } 393 394 io_napi_remove_stale(ctx, is_stale); 395 return 1; 396 } 397 398 #endif 399