1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "io_uring.h" 4 #include "napi.h" 5 6 #ifdef CONFIG_NET_RX_BUSY_POLL 7 8 /* Timeout for cleanout of stale entries. */ 9 #define NAPI_TIMEOUT (60 * SEC_CONVERSION) 10 11 struct io_napi_entry { 12 unsigned int napi_id; 13 struct list_head list; 14 15 unsigned long timeout; 16 struct hlist_node node; 17 18 struct rcu_head rcu; 19 }; 20 21 static struct io_napi_entry *io_napi_hash_find(struct hlist_head *hash_list, 22 unsigned int napi_id) 23 { 24 struct io_napi_entry *e; 25 26 hlist_for_each_entry_rcu(e, hash_list, node) { 27 if (e->napi_id != napi_id) 28 continue; 29 return e; 30 } 31 32 return NULL; 33 } 34 35 static inline ktime_t net_to_ktime(unsigned long t) 36 { 37 /* napi approximating usecs, reverse busy_loop_current_time */ 38 return ns_to_ktime(t << 10); 39 } 40 41 int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id, 42 unsigned int mode) 43 { 44 struct hlist_head *hash_list; 45 struct io_napi_entry *e; 46 47 /* Non-NAPI IDs can be rejected. */ 48 if (!napi_id_valid(napi_id)) 49 return -EINVAL; 50 51 hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; 52 53 scoped_guard(rcu) { 54 e = io_napi_hash_find(hash_list, napi_id); 55 if (e) { 56 WRITE_ONCE(e->timeout, jiffies + NAPI_TIMEOUT); 57 return -EEXIST; 58 } 59 } 60 61 e = kmalloc(sizeof(*e), GFP_NOWAIT); 62 if (!e) 63 return -ENOMEM; 64 65 e->napi_id = napi_id; 66 e->timeout = jiffies + NAPI_TIMEOUT; 67 68 /* 69 * guard(spinlock) is not used to manually unlock it before calling 70 * kfree() 71 */ 72 spin_lock(&ctx->napi_lock); 73 if (unlikely(READ_ONCE(ctx->napi_track_mode) != mode)) { 74 spin_unlock(&ctx->napi_lock); 75 kfree(e); 76 return -EINVAL; 77 } 78 if (unlikely(io_napi_hash_find(hash_list, napi_id))) { 79 spin_unlock(&ctx->napi_lock); 80 kfree(e); 81 return -EEXIST; 82 } 83 84 hlist_add_tail_rcu(&e->node, hash_list); 85 list_add_tail_rcu(&e->list, &ctx->napi_list); 86 spin_unlock(&ctx->napi_lock); 87 return 0; 88 } 89 90 static int __io_napi_del_id(struct io_ring_ctx *ctx, unsigned int napi_id) 91 { 92 struct hlist_head *hash_list; 93 struct io_napi_entry *e; 94 95 /* Non-NAPI IDs can be rejected. */ 96 if (!napi_id_valid(napi_id)) 97 return -EINVAL; 98 99 hash_list = &ctx->napi_ht[hash_min(napi_id, HASH_BITS(ctx->napi_ht))]; 100 guard(spinlock)(&ctx->napi_lock); 101 e = io_napi_hash_find(hash_list, napi_id); 102 if (!e) 103 return -ENOENT; 104 105 list_del_rcu(&e->list); 106 hash_del_rcu(&e->node); 107 kfree_rcu(e, rcu); 108 return 0; 109 } 110 111 static void __io_napi_remove_stale(struct io_ring_ctx *ctx) 112 { 113 struct io_napi_entry *e; 114 115 guard(spinlock)(&ctx->napi_lock); 116 /* 117 * list_for_each_entry_safe() is not required as long as: 118 * 1. list_del_rcu() does not reset the deleted node next pointer 119 * 2. kfree_rcu() delays the memory freeing until the next quiescent 120 * state 121 */ 122 list_for_each_entry(e, &ctx->napi_list, list) { 123 if (time_after(jiffies, READ_ONCE(e->timeout))) { 124 list_del_rcu(&e->list); 125 hash_del_rcu(&e->node); 126 kfree_rcu(e, rcu); 127 } 128 } 129 } 130 131 static inline void io_napi_remove_stale(struct io_ring_ctx *ctx, bool is_stale) 132 { 133 if (is_stale) 134 __io_napi_remove_stale(ctx); 135 } 136 137 static inline bool io_napi_busy_loop_timeout(ktime_t start_time, 138 ktime_t bp) 139 { 140 if (bp) { 141 ktime_t end_time = ktime_add(start_time, bp); 142 ktime_t now = net_to_ktime(busy_loop_current_time()); 143 144 return ktime_after(now, end_time); 145 } 146 147 return true; 148 } 149 150 static bool io_napi_busy_loop_should_end(void *data, 151 unsigned long start_time) 152 { 153 struct io_wait_queue *iowq = data; 154 155 if (signal_pending(current)) 156 return true; 157 if (io_should_wake(iowq) || io_has_work(iowq->ctx)) 158 return true; 159 if (io_napi_busy_loop_timeout(net_to_ktime(start_time), 160 iowq->napi_busy_poll_dt)) 161 return true; 162 163 return false; 164 } 165 166 /* 167 * never report stale entries 168 */ 169 static bool static_tracking_do_busy_loop(struct io_ring_ctx *ctx, 170 bool (*loop_end)(void *, unsigned long), 171 void *loop_end_arg) 172 { 173 struct io_napi_entry *e; 174 175 list_for_each_entry_rcu(e, &ctx->napi_list, list) 176 napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, 177 ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); 178 return false; 179 } 180 181 static bool 182 dynamic_tracking_do_busy_loop(struct io_ring_ctx *ctx, 183 bool (*loop_end)(void *, unsigned long), 184 void *loop_end_arg) 185 { 186 struct io_napi_entry *e; 187 bool is_stale = false; 188 189 list_for_each_entry_rcu(e, &ctx->napi_list, list) { 190 napi_busy_loop_rcu(e->napi_id, loop_end, loop_end_arg, 191 ctx->napi_prefer_busy_poll, BUSY_POLL_BUDGET); 192 193 if (time_after(jiffies, READ_ONCE(e->timeout))) 194 is_stale = true; 195 } 196 197 return is_stale; 198 } 199 200 static inline bool 201 __io_napi_do_busy_loop(struct io_ring_ctx *ctx, 202 bool (*loop_end)(void *, unsigned long), 203 void *loop_end_arg) 204 { 205 switch (READ_ONCE(ctx->napi_track_mode)) { 206 case IO_URING_NAPI_TRACKING_STATIC: 207 return static_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); 208 case IO_URING_NAPI_TRACKING_DYNAMIC: 209 return dynamic_tracking_do_busy_loop(ctx, loop_end, loop_end_arg); 210 default: 211 return false; 212 } 213 } 214 215 static void io_napi_blocking_busy_loop(struct io_ring_ctx *ctx, 216 struct io_wait_queue *iowq) 217 { 218 unsigned long start_time = busy_loop_current_time(); 219 bool (*loop_end)(void *, unsigned long) = NULL; 220 void *loop_end_arg = NULL; 221 bool is_stale = false; 222 223 /* Singular lists use a different napi loop end check function and are 224 * only executed once. 225 */ 226 if (list_is_singular(&ctx->napi_list)) { 227 loop_end = io_napi_busy_loop_should_end; 228 loop_end_arg = iowq; 229 } 230 231 scoped_guard(rcu) { 232 do { 233 is_stale = __io_napi_do_busy_loop(ctx, loop_end, 234 loop_end_arg); 235 } while (!io_napi_busy_loop_should_end(iowq, start_time) && 236 !loop_end_arg); 237 } 238 239 io_napi_remove_stale(ctx, is_stale); 240 } 241 242 /* 243 * io_napi_init() - Init napi settings 244 * @ctx: pointer to io-uring context structure 245 * 246 * Init napi settings in the io-uring context. 247 */ 248 void io_napi_init(struct io_ring_ctx *ctx) 249 { 250 u64 sys_dt = READ_ONCE(sysctl_net_busy_poll) * NSEC_PER_USEC; 251 252 INIT_LIST_HEAD(&ctx->napi_list); 253 spin_lock_init(&ctx->napi_lock); 254 ctx->napi_prefer_busy_poll = false; 255 ctx->napi_busy_poll_dt = ns_to_ktime(sys_dt); 256 ctx->napi_track_mode = IO_URING_NAPI_TRACKING_INACTIVE; 257 } 258 259 /* 260 * io_napi_free() - Deallocate napi 261 * @ctx: pointer to io-uring context structure 262 * 263 * Free the napi list and the hash table in the io-uring context. 264 */ 265 void io_napi_free(struct io_ring_ctx *ctx) 266 { 267 struct io_napi_entry *e; 268 269 guard(spinlock)(&ctx->napi_lock); 270 list_for_each_entry(e, &ctx->napi_list, list) { 271 hash_del_rcu(&e->node); 272 kfree_rcu(e, rcu); 273 } 274 INIT_LIST_HEAD_RCU(&ctx->napi_list); 275 } 276 277 static int io_napi_register_napi(struct io_ring_ctx *ctx, 278 struct io_uring_napi *napi) 279 { 280 switch (napi->op_param) { 281 case IO_URING_NAPI_TRACKING_DYNAMIC: 282 case IO_URING_NAPI_TRACKING_STATIC: 283 break; 284 default: 285 return -EINVAL; 286 } 287 WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); 288 io_napi_free(ctx); 289 /* cap NAPI at 10 msec of spin time */ 290 napi->busy_poll_to = min(10000, napi->busy_poll_to); 291 WRITE_ONCE(ctx->napi_busy_poll_dt, napi->busy_poll_to * NSEC_PER_USEC); 292 WRITE_ONCE(ctx->napi_prefer_busy_poll, !!napi->prefer_busy_poll); 293 WRITE_ONCE(ctx->napi_track_mode, napi->op_param); 294 return 0; 295 } 296 297 /* 298 * io_napi_register() - Register napi with io-uring 299 * @ctx: pointer to io-uring context structure 300 * @arg: pointer to io_uring_napi structure 301 * 302 * Register napi in the io-uring context. 303 */ 304 int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) 305 { 306 const struct io_uring_napi curr = { 307 .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), 308 .prefer_busy_poll = ctx->napi_prefer_busy_poll, 309 .op_param = ctx->napi_track_mode 310 }; 311 struct io_uring_napi napi; 312 313 if (ctx->flags & IORING_SETUP_IOPOLL) 314 return -EINVAL; 315 if (copy_from_user(&napi, arg, sizeof(napi))) 316 return -EFAULT; 317 if (napi.pad[0] || napi.pad[1] || napi.resv) 318 return -EINVAL; 319 320 if (copy_to_user(arg, &curr, sizeof(curr))) 321 return -EFAULT; 322 323 switch (napi.opcode) { 324 case IO_URING_NAPI_REGISTER_OP: 325 return io_napi_register_napi(ctx, &napi); 326 case IO_URING_NAPI_STATIC_ADD_ID: 327 if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) 328 return -EINVAL; 329 return __io_napi_add_id(ctx, napi.op_param, 330 IO_URING_NAPI_TRACKING_STATIC); 331 case IO_URING_NAPI_STATIC_DEL_ID: 332 if (curr.op_param != IO_URING_NAPI_TRACKING_STATIC) 333 return -EINVAL; 334 return __io_napi_del_id(ctx, napi.op_param); 335 default: 336 return -EINVAL; 337 } 338 } 339 340 /* 341 * io_napi_unregister() - Unregister napi with io-uring 342 * @ctx: pointer to io-uring context structure 343 * @arg: pointer to io_uring_napi structure 344 * 345 * Unregister napi. If arg has been specified copy the busy poll timeout and 346 * prefer busy poll setting to the passed in structure. 347 */ 348 int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) 349 { 350 const struct io_uring_napi curr = { 351 .busy_poll_to = ktime_to_us(ctx->napi_busy_poll_dt), 352 .prefer_busy_poll = ctx->napi_prefer_busy_poll 353 }; 354 355 if (arg && copy_to_user(arg, &curr, sizeof(curr))) 356 return -EFAULT; 357 358 WRITE_ONCE(ctx->napi_track_mode, IO_URING_NAPI_TRACKING_INACTIVE); 359 WRITE_ONCE(ctx->napi_busy_poll_dt, 0); 360 WRITE_ONCE(ctx->napi_prefer_busy_poll, false); 361 io_napi_free(ctx); 362 return 0; 363 } 364 365 /* 366 * __io_napi_busy_loop() - execute busy poll loop 367 * @ctx: pointer to io-uring context structure 368 * @iowq: pointer to io wait queue 369 * 370 * Execute the busy poll loop and merge the spliced off list. 371 */ 372 void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) 373 { 374 if (ctx->flags & IORING_SETUP_SQPOLL) 375 return; 376 377 iowq->napi_busy_poll_dt = READ_ONCE(ctx->napi_busy_poll_dt); 378 if (iowq->timeout != KTIME_MAX) { 379 ktime_t dt = ktime_sub(iowq->timeout, io_get_time(ctx)); 380 381 iowq->napi_busy_poll_dt = min_t(u64, iowq->napi_busy_poll_dt, dt); 382 } 383 384 iowq->napi_prefer_busy_poll = READ_ONCE(ctx->napi_prefer_busy_poll); 385 io_napi_blocking_busy_loop(ctx, iowq); 386 } 387 388 /* 389 * io_napi_sqpoll_busy_poll() - busy poll loop for sqpoll 390 * @ctx: pointer to io-uring context structure 391 * 392 * Splice of the napi list and execute the napi busy poll loop. 393 */ 394 int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) 395 { 396 bool is_stale = false; 397 398 if (!READ_ONCE(ctx->napi_busy_poll_dt)) 399 return 0; 400 if (list_empty_careful(&ctx->napi_list)) 401 return 0; 402 403 scoped_guard(rcu) { 404 is_stale = __io_napi_do_busy_loop(ctx, NULL, NULL); 405 } 406 407 io_napi_remove_stale(ctx, is_stale); 408 return 1; 409 } 410 411 #endif 412