// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/rrwlock.h>
#include <sys/trace_zfs.h>

/*
 * This file contains the implementation of a re-entrant read
 * reader/writer lock (aka "rrwlock").
 *
 * This is a normal reader/writer lock with the additional feature
 * of allowing threads that have already obtained a read lock to
 * take another read lock (a re-entrant read), even if there are
 * waiting writers.
 *
 * Callers who have not obtained a read lock give waiting writers priority.
 *
 * The rrwlock_t lock does not allow re-entrant writers, nor does it
 * allow a re-entrant mix of reads and writes (that is, it does not
 * allow a caller who has already obtained a read lock to then grab a
 * write lock without first dropping all read locks, and vice versa).
 *
 * The rrwlock_t uses tsd (thread specific data) to keep a list of
 * nodes (rrw_node_t), where each node keeps track of which specific
 * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
 * will store multiple rrw_node_ts with the same 'rn_rrl'.  Nodes on the
 * tsd list can represent different rrwlock_ts.  This allows a thread
 * to hold read locks on multiple distinct rrwlock_ts at the same time.
 *
 * Since using tsd adds some overhead, the rrwlock_t only needs to
 * keep tsd data when writers are waiting.  If no writers are waiting,
 * a reader just bumps the anonymous read count (rr_anon_rcount); no tsd
 * is needed.  Once a writer attempts to grab the lock, readers then
 * keep tsd data and bump the linked readers count (rr_linked_rcount).
 *
 * If there are waiting writers and there are anonymous readers, then a
 * new reader cannot tell whether its read is re-entrant.  But since it
 * may be, we allow the read to proceed (otherwise it could deadlock).
 * Since readers no longer bump the anonymous count once a writer is
 * waiting, the anonymous readers eventually flush themselves out.  At
 * that point, a reader can tell whether it is re-entrant (it has an
 * rrw_node_t entry for the lock) or not.  If it is, we must let it
 * proceed.  If it is not, the reader blocks behind the waiting writers.
 * Hence, we do not starve writers.
 */
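
/*
 * Illustrative usage sketch (hypothetical caller, not built; kept under
 * "#if 0"): a thread that already holds an rrwlock_t as reader may take
 * it again for read, even while another thread is blocked in
 * rrw_enter_write().  FTAG stands in for the caller's usual holder tag.
 */
#if 0
static void
rrw_example_reentrant_read(rrwlock_t *rrl)
{
	rrw_enter_read(rrl, FTAG);	/* outer read hold */
	/*
	 * Even if a writer starts waiting at this point, the re-entrant
	 * read below is allowed: either this thread's rrw_node_t or the
	 * still-nonzero anonymous count identifies it as an existing
	 * reader.
	 */
	rrw_enter_read(rrl, FTAG);	/* re-entrant read hold */
	rrw_exit(rrl, FTAG);
	rrw_exit(rrl, FTAG);
}
#endif
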
/* global key for TSD */
uint_t rrw_tsd_key;

typedef struct rrw_node {
	struct rrw_node *rn_next;
	rrwlock_t *rn_rrl;
	const void *rn_tag;
} rrw_node_t;

static rrw_node_t *
rrn_find(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
		return (NULL);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl)
			return (rn);
	}
	return (NULL);
}

/*
 * Add a node to the head of the singly linked list.
 */
static void
rrn_add(rrwlock_t *rrl, const void *tag)
{
	rrw_node_t *rn;

	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
	rn->rn_rrl = rrl;
	rn->rn_next = tsd_get(rrw_tsd_key);
	rn->rn_tag = tag;
	VERIFY0(tsd_set(rrw_tsd_key, rn));
}

/*
 * If a node is found for 'rrl', then remove the node from this
 * thread's list and return TRUE; otherwise return FALSE.
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl, const void *tag)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY0(tsd_set(rrw_tsd_key, rn->rn_next));
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

void
rrw_init(rrwlock_t *rrl, boolean_t track_all)
{
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	rrl->rr_writer = NULL;
	zfs_refcount_create(&rrl->rr_anon_rcount);
	zfs_refcount_create(&rrl->rr_linked_rcount);
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_track_all = track_all;
}

void
rrw_destroy(rrwlock_t *rrl)
{
	mutex_destroy(&rrl->rr_lock);
	cv_destroy(&rrl->rr_cv);
	ASSERT0P(rrl->rr_writer);
	zfs_refcount_destroy(&rrl->rr_anon_rcount);
	zfs_refcount_destroy(&rrl->rr_linked_rcount);
}

static void
rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, const void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(ZFS_DEBUG) && defined(_KERNEL)
	if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
	    !rrl->rr_track_all) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0);

	while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
	    zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted || rrl->rr_track_all) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl, tag);
		(void) zfs_refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) zfs_refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT0P(rrl->rr_writer);
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter_read(rrwlock_t *rrl, const void *tag)
{
	rrw_enter_read_impl(rrl, B_FALSE, tag);
}
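
/*
 * Illustrative sketch (hypothetical helper, not built; kept under "#if 0"):
 * restates the blocking predicate used by rrw_enter_read_impl() above.
 * A reader must wait only while a writer holds the lock, or while a writer
 * is waiting and this reader is provably not re-entrant: no anonymous
 * readers remain that it could be hiding among, no priority hint was
 * given, and it has no rrw_node_t for this lock on its tsd list.
 */
#if 0
static boolean_t
rrw_example_reader_must_wait(rrwlock_t *rrl, boolean_t prio)
{
	ASSERT(MUTEX_HELD(&rrl->rr_lock));
	return (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
	    zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
	    rrn_find(rrl) == NULL));
}
#endif
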
/*
 * Take a read lock even if there are pending write lock requests.  If we
 * want to take a lock re-entrantly, but from different threads (that have
 * a relationship to each other), the normal detection mechanism to overrule
 * the pending writer does not work, so we have to give an explicit hint here.
 */
void
rrw_enter_read_prio(rrwlock_t *rrl, const void *tag)
{
	rrw_enter_read_impl(rrl, B_TRUE, tag);
}

void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);

	while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 ||
	    zfs_refcount_count(&rrl->rr_linked_rcount) > 0 ||
	    rrl->rr_writer != NULL) {
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_writer = curthread;
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter(rrwlock_t *rrl, krw_t rw, const void *tag)
{
	if (rw == RW_READER)
		rrw_enter_read(rrl, tag);
	else
		rrw_enter_write(rrl);
}

void
rrw_exit(rrwlock_t *rrl, const void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(ZFS_DEBUG) && defined(_KERNEL)
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !zfs_refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		int64_t count;
		if (rrn_find_and_remove(rrl, tag)) {
			count = zfs_refcount_remove(
			    &rrl->rr_linked_rcount, tag);
		} else {
			ASSERT(!rrl->rr_track_all);
			count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag);
		}
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) &&
		    zfs_refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}

/*
 * If the lock was created with track_all, rrw_held(RW_READER) will return
 * B_TRUE iff the current thread has the lock for reader.  Otherwise it may
 * return B_TRUE if any thread has the lock for reader.
 */
boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw == RW_WRITER) {
		held = (rrl->rr_writer == curthread);
	} else {
		held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) ||
		    rrn_find(rrl) != NULL);
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}

void
rrw_tsd_destroy(void *arg)
{
	rrw_node_t *rn = arg;
	if (rn != NULL) {
		panic("thread %p terminating with rrw lock %p held",
		    (void *)curthread, (void *)rn->rn_rrl);
	}
}
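
/*
 * Illustrative usage sketch (hypothetical caller, not built; kept under
 * "#if 0"): a writer blocks until every anonymous and linked reader has
 * exited, then holds the lock exclusively.  The same thread must not
 * re-enter for write or mix read and write holds.  FTAG stands in for
 * the caller's usual holder tag.
 */
#if 0
static void
rrw_example_writer(rrwlock_t *rrl)
{
	rrw_enter(rrl, RW_WRITER, FTAG);
	ASSERT(rrw_held(rrl, RW_WRITER));
	/* ... modify state protected by 'rrl' ... */
	rrw_exit(rrl, FTAG);
}
#endif
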
/*
 * A reader-mostly lock implementation, built on top of the rrwlock above
 * and tuned for highly parallel read acquisitions, while pessimizing writes.
 *
 * The idea is to split a single busy lock into an array of locks, so that
 * each reader locks only one of them for read, chosen by a simple hash
 * function.  That proportionally reduces lock congestion.  A writer, on
 * the other hand, has to sequentially acquire write on all the locks.
 * That makes write acquisition proportionally slower, but in the places
 * where it is used (filesystem unmount) performance is not critical.
 *
 * All the functions below are direct wrappers around the functions above.
 */
void
rrm_init(rrmlock_t *rrl, boolean_t track_all)
{
	int i;

	for (i = 0; i < RRM_NUM_LOCKS; i++)
		rrw_init(&rrl->locks[i], track_all);
}

void
rrm_destroy(rrmlock_t *rrl)
{
	int i;

	for (i = 0; i < RRM_NUM_LOCKS; i++)
		rrw_destroy(&rrl->locks[i]);
}

void
rrm_enter(rrmlock_t *rrl, krw_t rw, const void *tag)
{
	if (rw == RW_READER)
		rrm_enter_read(rrl, tag);
	else
		rrm_enter_write(rrl);
}

/*
 * This maps the current thread to a specific lock.  Note that the lock
 * must be released by the same thread that acquired it.  We do this
 * mapping by taking the thread pointer mod a prime number.  We examine
 * only the low 32 bits of the thread pointer, because 32-bit division
 * is faster than 64-bit division, and the high 32 bits have little
 * entropy anyway.
 */
#define	RRM_TD_LOCK()	(((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)

void
rrm_enter_read(rrmlock_t *rrl, const void *tag)
{
	rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
}

void
rrm_enter_write(rrmlock_t *rrl)
{
	int i;

	for (i = 0; i < RRM_NUM_LOCKS; i++)
		rrw_enter_write(&rrl->locks[i]);
}

void
rrm_exit(rrmlock_t *rrl, const void *tag)
{
	int i;

	if (rrl->locks[0].rr_writer == curthread) {
		for (i = 0; i < RRM_NUM_LOCKS; i++)
			rrw_exit(&rrl->locks[i], tag);
	} else {
		rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
	}
}

boolean_t
rrm_held(rrmlock_t *rrl, krw_t rw)
{
	if (rw == RW_WRITER) {
		return (rrw_held(&rrl->locks[0], rw));
	} else {
		return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
	}
}
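
/*
 * Illustrative usage sketch (hypothetical caller, not built; kept under
 * "#if 0"): readers of an rrmlock_t contend only on the single sub-lock
 * selected by RRM_TD_LOCK(), while a writer pays the cost of taking all
 * RRM_NUM_LOCKS sub-locks.  FTAG stands in for the caller's usual holder
 * tag.
 */
#if 0
static void
rrm_example_usage(rrmlock_t *rrm)
{
	rrm_enter_read(rrm, FTAG);	/* locks only locks[RRM_TD_LOCK()] */
	ASSERT(rrm_held(rrm, RW_READER));
	rrm_exit(rrm, FTAG);

	rrm_enter_write(rrm);		/* takes all RRM_NUM_LOCKS sub-locks */
	ASSERT(rrm_held(rrm, RW_WRITER));
	rrm_exit(rrm, FTAG);		/* detects the writer, drops them all */
}
#endif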