1 // SPDX-License-Identifier: GPL-2.0 2 3 #include "blk-rq-qos.h" 4 5 __read_mostly DEFINE_STATIC_KEY_FALSE(block_rq_qos); 6 7 /* 8 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, 9 * false if 'v' + 1 would be bigger than 'below'. 10 */ 11 static bool atomic_inc_below(atomic_t *v, unsigned int below) 12 { 13 unsigned int cur = atomic_read(v); 14 15 do { 16 if (cur >= below) 17 return false; 18 } while (!atomic_try_cmpxchg(v, &cur, cur + 1)); 19 20 return true; 21 } 22 23 bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) 24 { 25 return atomic_inc_below(&rq_wait->inflight, limit); 26 } 27 28 void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio) 29 { 30 do { 31 if (rqos->ops->cleanup) 32 rqos->ops->cleanup(rqos, bio); 33 rqos = rqos->next; 34 } while (rqos); 35 } 36 37 void __rq_qos_done(struct rq_qos *rqos, struct request *rq) 38 { 39 do { 40 if (rqos->ops->done) 41 rqos->ops->done(rqos, rq); 42 rqos = rqos->next; 43 } while (rqos); 44 } 45 46 void __rq_qos_issue(struct rq_qos *rqos, struct request *rq) 47 { 48 do { 49 if (rqos->ops->issue) 50 rqos->ops->issue(rqos, rq); 51 rqos = rqos->next; 52 } while (rqos); 53 } 54 55 void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq) 56 { 57 do { 58 if (rqos->ops->requeue) 59 rqos->ops->requeue(rqos, rq); 60 rqos = rqos->next; 61 } while (rqos); 62 } 63 64 void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio) 65 { 66 do { 67 if (rqos->ops->throttle) 68 rqos->ops->throttle(rqos, bio); 69 rqos = rqos->next; 70 } while (rqos); 71 } 72 73 void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) 74 { 75 do { 76 if (rqos->ops->track) 77 rqos->ops->track(rqos, rq, bio); 78 rqos = rqos->next; 79 } while (rqos); 80 } 81 82 void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) 83 { 84 do { 85 if (rqos->ops->merge) 86 rqos->ops->merge(rqos, rq, bio); 87 rqos = rqos->next; 88 } while (rqos); 89 } 90 91 void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) 92 { 93 do { 94 if (rqos->ops->done_bio) 95 rqos->ops->done_bio(rqos, bio); 96 rqos = rqos->next; 97 } while (rqos); 98 } 99 100 void __rq_qos_queue_depth_changed(struct rq_qos *rqos) 101 { 102 do { 103 if (rqos->ops->queue_depth_changed) 104 rqos->ops->queue_depth_changed(rqos); 105 rqos = rqos->next; 106 } while (rqos); 107 } 108 109 /* 110 * Return true, if we can't increase the depth further by scaling 111 */ 112 bool rq_depth_calc_max_depth(struct rq_depth *rqd) 113 { 114 unsigned int depth; 115 bool ret = false; 116 117 /* 118 * For QD=1 devices, this is a special case. It's important for those 119 * to have one request ready when one completes, so force a depth of 120 * 2 for those devices. On the backend, it'll be a depth of 1 anyway, 121 * since the device can't have more than that in flight. If we're 122 * scaling down, then keep a setting of 1/1/1. 123 */ 124 if (rqd->queue_depth == 1) { 125 if (rqd->scale_step > 0) 126 rqd->max_depth = 1; 127 else { 128 rqd->max_depth = 2; 129 ret = true; 130 } 131 } else { 132 /* 133 * scale_step == 0 is our default state. If we have suffered 134 * latency spikes, step will be > 0, and we shrink the 135 * allowed write depths. If step is < 0, we're only doing 136 * writes, and we allow a temporarily higher depth to 137 * increase performance. 138 */ 139 depth = min_t(unsigned int, rqd->default_depth, 140 rqd->queue_depth); 141 if (rqd->scale_step > 0) 142 depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); 143 else if (rqd->scale_step < 0) { 144 unsigned int maxd = 3 * rqd->queue_depth / 4; 145 146 depth = 1 + ((depth - 1) << -rqd->scale_step); 147 if (depth > maxd) { 148 depth = maxd; 149 ret = true; 150 } 151 } 152 153 rqd->max_depth = depth; 154 } 155 156 return ret; 157 } 158 159 /* Returns true on success and false if scaling up wasn't possible */ 160 bool rq_depth_scale_up(struct rq_depth *rqd) 161 { 162 /* 163 * Hit max in previous round, stop here 164 */ 165 if (rqd->scaled_max) 166 return false; 167 168 rqd->scale_step--; 169 170 rqd->scaled_max = rq_depth_calc_max_depth(rqd); 171 return true; 172 } 173 174 /* 175 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we 176 * had a latency violation. Returns true on success and returns false if 177 * scaling down wasn't possible. 178 */ 179 bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) 180 { 181 /* 182 * Stop scaling down when we've hit the limit. This also prevents 183 * ->scale_step from going to crazy values, if the device can't 184 * keep up. 185 */ 186 if (rqd->max_depth == 1) 187 return false; 188 189 if (rqd->scale_step < 0 && hard_throttle) 190 rqd->scale_step = 0; 191 else 192 rqd->scale_step++; 193 194 rqd->scaled_max = false; 195 rq_depth_calc_max_depth(rqd); 196 return true; 197 } 198 199 struct rq_qos_wait_data { 200 struct wait_queue_entry wq; 201 struct rq_wait *rqw; 202 acquire_inflight_cb_t *cb; 203 void *private_data; 204 bool got_token; 205 }; 206 207 static int rq_qos_wake_function(struct wait_queue_entry *curr, 208 unsigned int mode, int wake_flags, void *key) 209 { 210 struct rq_qos_wait_data *data = container_of(curr, 211 struct rq_qos_wait_data, 212 wq); 213 214 /* 215 * If we fail to get a budget, return -1 to interrupt the wake up loop 216 * in __wake_up_common. 217 */ 218 if (!data->cb(data->rqw, data->private_data)) 219 return -1; 220 221 data->got_token = true; 222 /* 223 * autoremove_wake_function() removes the wait entry only when it 224 * actually changed the task state. We want the wait always removed. 225 * Remove explicitly and use default_wake_function(). 226 */ 227 default_wake_function(curr, mode, wake_flags, key); 228 /* 229 * Note that the order of operations is important as finish_wait() 230 * tests whether @curr is removed without grabbing the lock. This 231 * should be the last thing to do to make sure we will not have a 232 * UAF access to @data. And the semantics of memory barrier in it 233 * also make sure the waiter will see the latest @data->got_token 234 * once list_empty_careful() in finish_wait() returns true. 235 */ 236 list_del_init_careful(&curr->entry); 237 return 1; 238 } 239 240 /** 241 * rq_qos_wait - throttle on a rqw if we need to 242 * @rqw: rqw to throttle on 243 * @private_data: caller provided specific data 244 * @acquire_inflight_cb: inc the rqw->inflight counter if we can 245 * @cleanup_cb: the callback to cleanup in case we race with a waker 246 * 247 * This provides a uniform place for the rq_qos users to do their throttling. 248 * Since you can end up with a lot of things sleeping at once, this manages the 249 * waking up based on the resources available. The acquire_inflight_cb should 250 * inc the rqw->inflight if we have the ability to do so, or return false if not 251 * and then we will sleep until the room becomes available. 252 * 253 * cleanup_cb is in case that we race with a waker and need to cleanup the 254 * inflight count accordingly. 255 */ 256 void rq_qos_wait(struct rq_wait *rqw, void *private_data, 257 acquire_inflight_cb_t *acquire_inflight_cb, 258 cleanup_cb_t *cleanup_cb) 259 { 260 struct rq_qos_wait_data data = { 261 .rqw = rqw, 262 .cb = acquire_inflight_cb, 263 .private_data = private_data, 264 .got_token = false, 265 }; 266 bool first_waiter; 267 268 /* 269 * If there are no waiters in the waiting queue, try to increase the 270 * inflight counter if we can. Otherwise, prepare for adding ourselves 271 * to the waiting queue. 272 */ 273 if (!waitqueue_active(&rqw->wait) && acquire_inflight_cb(rqw, private_data)) 274 return; 275 276 init_wait_func(&data.wq, rq_qos_wake_function); 277 first_waiter = prepare_to_wait_exclusive(&rqw->wait, &data.wq, 278 TASK_UNINTERRUPTIBLE); 279 /* 280 * Make sure there is at least one inflight process; otherwise, waiters 281 * will never be woken up. Since there may be no inflight process before 282 * adding ourselves to the waiting queue above, we need to try to 283 * increase the inflight counter for ourselves. And it is sufficient to 284 * guarantee that at least the first waiter to enter the waiting queue 285 * will re-check the waiting condition before going to sleep, thus 286 * ensuring forward progress. 287 */ 288 if (!data.got_token && first_waiter && acquire_inflight_cb(rqw, private_data)) { 289 finish_wait(&rqw->wait, &data.wq); 290 /* 291 * We raced with rq_qos_wake_function() getting a token, 292 * which means we now have two. Put our local token 293 * and wake anyone else potentially waiting for one. 294 * 295 * Enough memory barrier in list_empty_careful() in 296 * finish_wait() is paired with list_del_init_careful() 297 * in rq_qos_wake_function() to make sure we will see 298 * the latest @data->got_token. 299 */ 300 if (data.got_token) 301 cleanup_cb(rqw, private_data); 302 return; 303 } 304 305 /* we are now relying on the waker to increase our inflight counter. */ 306 do { 307 if (data.got_token) 308 break; 309 io_schedule(); 310 set_current_state(TASK_UNINTERRUPTIBLE); 311 } while (1); 312 finish_wait(&rqw->wait, &data.wq); 313 } 314 315 void rq_qos_exit(struct request_queue *q) 316 { 317 mutex_lock(&q->rq_qos_mutex); 318 while (q->rq_qos) { 319 struct rq_qos *rqos = q->rq_qos; 320 q->rq_qos = rqos->next; 321 rqos->ops->exit(rqos); 322 static_branch_dec(&block_rq_qos); 323 } 324 mutex_unlock(&q->rq_qos_mutex); 325 } 326 327 int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, 328 const struct rq_qos_ops *ops) 329 { 330 struct request_queue *q = disk->queue; 331 unsigned int memflags; 332 333 lockdep_assert_held(&q->rq_qos_mutex); 334 335 rqos->disk = disk; 336 rqos->id = id; 337 rqos->ops = ops; 338 339 /* 340 * No IO can be in-flight when adding rqos, so freeze queue, which 341 * is fine since we only support rq_qos for blk-mq queue. 342 */ 343 memflags = blk_mq_freeze_queue(q); 344 345 if (rq_qos_id(q, rqos->id)) 346 goto ebusy; 347 rqos->next = q->rq_qos; 348 q->rq_qos = rqos; 349 static_branch_inc(&block_rq_qos); 350 351 blk_mq_unfreeze_queue(q, memflags); 352 353 if (rqos->ops->debugfs_attrs) { 354 mutex_lock(&q->debugfs_mutex); 355 blk_mq_debugfs_register_rqos(rqos); 356 mutex_unlock(&q->debugfs_mutex); 357 } 358 359 return 0; 360 ebusy: 361 blk_mq_unfreeze_queue(q, memflags); 362 return -EBUSY; 363 } 364 365 void rq_qos_del(struct rq_qos *rqos) 366 { 367 struct request_queue *q = rqos->disk->queue; 368 struct rq_qos **cur; 369 unsigned int memflags; 370 371 lockdep_assert_held(&q->rq_qos_mutex); 372 373 memflags = blk_mq_freeze_queue(q); 374 for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { 375 if (*cur == rqos) { 376 *cur = rqos->next; 377 break; 378 } 379 } 380 blk_mq_unfreeze_queue(q, memflags); 381 382 mutex_lock(&q->debugfs_mutex); 383 blk_mq_debugfs_unregister_rqos(rqos); 384 mutex_unlock(&q->debugfs_mutex); 385 } 386