1 /*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12 /*
13 * Copyright 2024 Oxide Computer Company
14 */
15
16 /*
17 * This implements the general locking routines. See the big theory section
18 * 'ioctls, Errors, and Exclusive Access' for more information.
19 */
20
21 #include <sys/stddef.h>
22 #include <sys/nvme.h>
23
24 #include "nvme_reg.h"
25 #include "nvme_var.h"
26
27 /*
28 * Do we have a writer or someone pending. Note, some cases require checking
29 * both of these and others do not. Please see each individual check for the
30 * nuance here. As a general rule of thumb, when locking, the pending writers
31 * are important. However, when passing the lock on to the next owner (the
32 * handoff functions below), one doesn't check it.
33 */
34 static boolean_t
nvme_rwlock_wr_or_pend(nvme_lock_t * lock)35 nvme_rwlock_wr_or_pend(nvme_lock_t *lock)
36 {
37 return (lock->nl_writer != NULL ||
38 list_is_empty(&lock->nl_pend_writers) == 0);
39 }
40
41 /*
42 * Taking a namespace read lock requires that there is no writer (or pending) on
43 * the controller and the namespace.
44 */
45 static boolean_t
nvme_rwlock_block_ns_rdlock(nvme_t * nvme,nvme_namespace_t * ns)46 nvme_rwlock_block_ns_rdlock(nvme_t *nvme, nvme_namespace_t *ns)
47 {
48 return (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
49 nvme_rwlock_wr_or_pend(&ns->ns_lock));
50 }
51
52 /*
53 * The following entities all block a namespace write lock from being taken:
54 *
55 * 1) Any active or pending writer on the controller lock. They block and starve
56 * namespace writers respectively.
57 * 2) Any active or pending writers on the namespace lock. We must wait in line.
58 * 3) Any active readers on the namespace lock. We ignore pending namespace
59 * readers as by definition that implies some other situation will cause
60 * this.
61 */
62 static boolean_t
nvme_rwlock_block_ns_wrlock(nvme_t * nvme,nvme_namespace_t * ns)63 nvme_rwlock_block_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
64 {
65 return (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
66 nvme_rwlock_wr_or_pend(&ns->ns_lock) ||
67 list_is_empty(&ns->ns_lock.nl_readers) == 0);
68 }
69
70 /*
71 * The only thing that blocks acquisition of a controller read lock is if
72 * there are outstanding or pending writers on the controller lock. We can
73 * ignore the state of all namespaces here.
74 */
75 static boolean_t
nvme_rwlock_block_ctrl_rdlock(nvme_t * nvme)76 nvme_rwlock_block_ctrl_rdlock(nvme_t *nvme)
77 {
78 return (nvme_rwlock_wr_or_pend(&nvme->n_lock));
79 }
80
81 /*
82 * Taking the controller write lock is the most challenging of all, but also
83 * takes priority. The following all block a controller write lock from being
84 * taken:
85 *
86 * 1) Any controller write lock or pending write
87 * 2) Any controller read lock. We skip pending reads because if they exist,
88 * some other situation causes that that will trip us.
89 * 3) Any namespace having a write lock. We ignore pending writes because by
90 * definition there is some condition that causes that to be the case.
91 * 4) Any read lock on a namespace. We ignore pending reads like in the
92 * controller case.
93 */
94 static boolean_t
nvme_rwlock_block_ctrl_wrlock(nvme_t * nvme)95 nvme_rwlock_block_ctrl_wrlock(nvme_t *nvme)
96 {
97 if (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
98 list_is_empty(&nvme->n_lock.nl_readers) == 0) {
99 return (B_TRUE);
100 }
101
102 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
103 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
104 if (ns->ns_lock.nl_writer != NULL ||
105 list_is_empty(&ns->ns_lock.nl_readers) == 0) {
106 return (B_TRUE);
107 }
108 }
109
110 return (B_FALSE);
111 }
112
113 /*
114 * Answer can we hand off the world to a pending controller write lock. This has
115 * similar rules to the above; however, we critically _ignore_ pending
116 * controller write lock holds, as the assumption is that they are here, so the
117 * only consideration from above are controller reader locks and namespace
118 * locks.
119 */
120 static boolean_t
nvme_rwlock_handoff_ctrl_wrlock(nvme_t * nvme)121 nvme_rwlock_handoff_ctrl_wrlock(nvme_t *nvme)
122 {
123 /* See nvme_rwlock_wakeup() for on why this can be done. */
124 ASSERT3P(nvme->n_lock.nl_writer, ==, NULL);
125
126 if (list_is_empty(&nvme->n_lock.nl_readers) == 0) {
127 return (B_FALSE);
128 }
129
130 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
131 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
132 if (ns->ns_lock.nl_writer != NULL ||
133 list_is_empty(&ns->ns_lock.nl_readers) == 0) {
134 return (B_FALSE);
135 }
136 }
137
138 return (B_TRUE);
139 }
140
141 /*
142 * Namespace handoff variant. It skips pending writers on the namespace lock,
143 * but fully considers them on the controller due to their priority. Otherwise
144 * this follows the same rules as the normal blocking check.
145 */
146 static boolean_t
nvme_rwlock_handoff_ns_wrlock(nvme_t * nvme,nvme_namespace_t * ns)147 nvme_rwlock_handoff_ns_wrlock(nvme_t *nvme, nvme_namespace_t *ns)
148 {
149 if (nvme_rwlock_wr_or_pend(&nvme->n_lock) ||
150 list_is_empty(&nvme->n_lock.nl_readers) == 0) {
151 return (B_FALSE);
152 }
153
154 if (ns->ns_lock.nl_writer != NULL ||
155 list_is_empty(&ns->ns_lock.nl_readers) == 0) {
156 return (B_FALSE);
157 }
158
159 return (B_TRUE);
160 }
161
162 static void
nvme_rwlock_rdlock(nvme_minor_lock_info_t * info,nvme_lock_t * lock)163 nvme_rwlock_rdlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
164 {
165 ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
166 ASSERT3P(lock->nl_writer, ==, NULL);
167 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
168 ASSERT3U(list_link_active(&info->nli_node), ==, 0);
169 ASSERT3P(info->nli_minor, !=, NULL);
170 ASSERT3P(info->nli_nvme, !=, NULL);
171 ASSERT3U(info->nli_curlevel, ==, NVME_LOCK_L_READ);
172
173 info->nli_state = NVME_LOCK_STATE_ACQUIRED;
174 info->nli_last_change = gethrtime();
175 info->nli_acq_kthread = (uintptr_t)curthread;
176 info->nli_acq_pid = (uint32_t)curproc->p_pid;
177
178 list_insert_tail(&lock->nl_readers, info);
179 lock->nl_nread_locks++;
180 }
181
182 static void
nvme_rwlock_wrlock(nvme_minor_lock_info_t * info,nvme_lock_t * lock)183 nvme_rwlock_wrlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
184 {
185 ASSERT3P(lock->nl_writer, ==, NULL);
186 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
187 ASSERT3U(list_link_active(&info->nli_node), ==, 0);
188 ASSERT3P(info->nli_minor, !=, NULL);
189 ASSERT3P(info->nli_nvme, !=, NULL);
190
191 info->nli_state = NVME_LOCK_STATE_ACQUIRED;
192 info->nli_curlevel = NVME_LOCK_L_WRITE;
193 info->nli_last_change = gethrtime();
194 info->nli_acq_kthread = (uintptr_t)curthread;
195 info->nli_acq_pid = (uint32_t)curproc->p_pid;
196
197 lock->nl_writer = info;
198 lock->nl_nwrite_locks++;
199 }
200
#ifdef DEBUG
/*
 * Sanity check for our lock logic, only built on DEBUG kernels: walk the
 * lock's list of active readers and report whether 'info' is one of them.
 */
static boolean_t
nvme_rwlock_is_reader(nvme_lock_t *lock, const nvme_minor_lock_info_t *info)
{
	nvme_minor_lock_info_t *cur = list_head(&lock->nl_readers);

	/* Stop at the first match or when we run off the end of the list. */
	while (cur != NULL && cur != info) {
		cur = list_next(&lock->nl_readers, cur);
	}

	return (cur != NULL ? B_TRUE : B_FALSE);
}
#endif
217
218 static void
nvme_rwlock_signal_one(nvme_minor_lock_info_t * info,nvme_ioctl_errno_t err)219 nvme_rwlock_signal_one(nvme_minor_lock_info_t *info, nvme_ioctl_errno_t err)
220 {
221 ASSERT3P(info->nli_ioc, !=, NULL);
222 ASSERT3P(info->nli_minor, !=, NULL);
223 ASSERT3P(info->nli_state, !=, NVME_LOCK_STATE_BLOCKED);
224
225 if (err == NVME_IOCTL_E_OK) {
226 nvme_ioctl_success(info->nli_ioc);
227 } else {
228 (void) nvme_ioctl_error(info->nli_ioc, err, 0, 0);
229 }
230
231 cv_signal(&info->nli_minor->nm_cv);
232 }
233
234 static void
nvme_rwlock_wakeup_readers(nvme_lock_t * lock)235 nvme_rwlock_wakeup_readers(nvme_lock_t *lock)
236 {
237 nvme_minor_lock_info_t *info;
238
239 if (list_is_empty(&lock->nl_pend_readers) != 0) {
240 return;
241 }
242
243 ASSERT3U(list_is_empty(&lock->nl_readers), !=, 0);
244 ASSERT3P(lock->nl_writer, ==, NULL);
245 ASSERT3U(list_is_empty(&lock->nl_pend_writers), !=, 0);
246 while ((info = list_remove_head(&lock->nl_pend_readers)) != NULL) {
247 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
248 nvme_rwlock_rdlock(info, lock);
249 nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
250 }
251 }
252
253 /*
254 * An unlock occurred somewhere. We need to evaluate the total state of the
255 * world. An unlock of a namespace can allow a controller lock to proceed. On
256 * the other hand, dropping the controller write lock allows every namespace to
257 * proceed. While we know the context of where the unlock occurred, it's simpler
258 * right now to just allow everything to continue. This is somewhat expensive,
259 * but this can be sped up with more cached information when it's justified. We
260 * process things in the following order:
261 *
262 * 1) Evaluate if someone can now take a controller write lock. If so, wake up
263 * the head of the list and then all subsequent processing is done.
264 * 2) Evaluate if there are pending readers for the controller. If so, wake up
265 * each and every waiter. Always continue to namespaces in this case.
266 *
267 * For each namespace:
268 *
269 * 1) Evaluate if there are pending writers and they can take the write lock. If
270 * so, wake up the head of the list. If so, continue to the next namespace.
271 * 2) Otherwise, if there are pending readers. If so, wake up each and every
272 * reader. Continue onto the next namespace.
273 */
274 static void
nvme_rwlock_wakeup(nvme_t * nvme)275 nvme_rwlock_wakeup(nvme_t *nvme)
276 {
277 nvme_lock_t *ctrl_lock = &nvme->n_lock;
278
279 /*
280 * This assertion may seem weird, but it's actually a bit of an
281 * invariant. When the controller's write lock is taken, by definition
282 * there are no other locks that can be taken. Therefore if we were
283 * somehow unable to unlock a lock on this controller, then we'd be
284 * violating our rules.
285 */
286 VERIFY3P(ctrl_lock->nl_writer, ==, NULL);
287
288 /*
289 * If there are pending writers, either one of them will be woken up or
290 * no one will. Writers trump readers, but it's possible that we may not
291 * be able to wake up a waiting writer yet. If we take this arm, we
292 * should not process anything else. The same logic applies in the
293 * namespace case as well.
294 */
295 if (list_is_empty(&ctrl_lock->nl_pend_writers) == 0) {
296 nvme_minor_lock_info_t *info;
297
298 if (!nvme_rwlock_handoff_ctrl_wrlock(nvme))
299 return;
300
301 /*
302 * We opt to indicate that this is unlocked ahead of
303 * taking the lock for state tracking purposes.
304 */
305 info = list_remove_head(&ctrl_lock->nl_pend_writers);
306 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
307 nvme_rwlock_wrlock(info, ctrl_lock);
308 nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
309 return;
310 }
311
312 nvme_rwlock_wakeup_readers(ctrl_lock);
313 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
314 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
315 nvme_lock_t *ns_lock = &ns->ns_lock;
316
317 if (list_is_empty(&ns_lock->nl_pend_writers) == 0) {
318 nvme_minor_lock_info_t *info;
319
320 if (!nvme_rwlock_handoff_ns_wrlock(nvme, ns))
321 continue;
322
323 info = list_remove_head(&ns_lock->nl_pend_writers);
324 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
325 nvme_rwlock_wrlock(info, ns_lock);
326 nvme_rwlock_signal_one(info, NVME_IOCTL_E_OK);
327 } else {
328 nvme_rwlock_wakeup_readers(ns_lock);
329 }
330 }
331 }
332
333 /*
334 * This cleans up all the state in the minor for returning without a lock held.
335 */
336 static void
nvme_rwunlock_cleanup_minor(nvme_minor_lock_info_t * info)337 nvme_rwunlock_cleanup_minor(nvme_minor_lock_info_t *info)
338 {
339 info->nli_lock = NULL;
340 info->nli_state = NVME_LOCK_STATE_UNLOCKED;
341 info->nli_curlevel = 0;
342 info->nli_ns = NULL;
343 }
344
345 /*
346 * We've been asked to unlock a lock. Not only must we remove our hold from this
347 * lock, we must go through and wake up the next waiter. The waiters that we
348 * have to wake up vary depending on our lock. See section 'ioctls, Errors, and
349 * Exclusive Access' in the theory statement for more information.
350 */
351
352 void
nvme_rwunlock(nvme_minor_lock_info_t * info,nvme_lock_t * lock)353 nvme_rwunlock(nvme_minor_lock_info_t *info, nvme_lock_t *lock)
354 {
355 nvme_t *const nvme = info->nli_nvme;
356 boolean_t is_read;
357
358 VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));
359 VERIFY3P(info->nli_lock, ==, lock);
360 VERIFY(info->nli_curlevel == NVME_LOCK_L_READ ||
361 info->nli_curlevel == NVME_LOCK_L_WRITE);
362 is_read = info->nli_curlevel == NVME_LOCK_L_READ;
363
364 /*
365 * First we need to remove this minor from the lock and clean up all of
366 * the state this lock in the info structure.
367 */
368 info->nli_last_change = gethrtime();
369 if (is_read) {
370 VERIFY3U(list_link_active(&info->nli_node), !=, 0);
371 ASSERT3U(nvme_rwlock_is_reader(lock, info), ==, B_TRUE);
372 list_remove(&lock->nl_readers, info);
373 } else {
374 VERIFY3U(list_link_active(&info->nli_node), ==, 0);
375 VERIFY3P(lock->nl_writer, ==, info);
376 lock->nl_writer = NULL;
377 }
378
379 nvme_rwunlock_cleanup_minor(info);
380 nvme_rwlock_wakeup(nvme);
381 }
382
383 /*
384 * We were just interrupted due to a signal. However, just because our block was
385 * interrupted due to a signal doesn't mean that other activity didn't occur. In
386 * particular, the signal wake up could race with a subsequent wake up that was
387 * due to the device being removed or actually acquiring the lock. Depending on
388 * which state we were in, we need to perform the appropriate clean up. In all
389 * cases, the signal trumps all, which may mean actually unlocking!
390 */
391 static void
nvme_rwlock_signal(nvme_minor_lock_info_t * info,nvme_lock_t * lock,boolean_t is_read)392 nvme_rwlock_signal(nvme_minor_lock_info_t *info, nvme_lock_t *lock,
393 boolean_t is_read)
394 {
395 ASSERT3P(info->nli_ioc, !=, NULL);
396
397 /*
398 * We're changing the state here, so update the minor's last change
399 * time.
400 */
401 info->nli_last_change = gethrtime();
402 lock->nl_nsignals++;
403
404 /*
405 * This is the simplest case. We've already been removed from the lock
406 * that we're on. All we need to do is change the error to indicate that
407 * we received a signal.
408 */
409 if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
410 ASSERT3P(info->nli_lock, ==, NULL);
411 (void) nvme_ioctl_error(info->nli_ioc,
412 NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
413 lock->nl_nsig_unlock++;
414 return;
415 }
416
417 /*
418 * For all others, the lock should be set here.
419 */
420 ASSERT3P(info->nli_lock, ==, lock);
421
422 /*
423 * For someone that was blocked, we need to remove them from the pending
424 * lists.
425 */
426 if (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
427 ASSERT3S(list_link_active(&info->nli_node), !=, 0);
428 if (is_read) {
429 list_remove(&lock->nl_pend_readers, info);
430 } else {
431 list_remove(&lock->nl_pend_writers, info);
432 }
433
434 nvme_rwunlock_cleanup_minor(info);
435 (void) nvme_ioctl_error(info->nli_ioc,
436 NVME_IOCTL_E_LOCK_WAIT_SIGNAL, 0, 0);
437 lock->nl_nsig_blocks++;
438 return;
439 }
440
441 /*
442 * Now, the most nuanced thing that we need to do. We need to unlock
443 * this node. We synthesize an unlock request and submit that.
444 */
445 lock->nl_nsig_acq++;
446 nvme_rwunlock(info, lock);
447 }
448
449 /*
450 * Here we need to implement our read-write lock policy. Refer to the big theory
451 * statement for more information. Here's a summary of the priority that's
452 * relevant here:
453 *
454 * 1) Waiting writers starve waiting readers
455 * 2) Waiting writers for the controller starve all namespace writers and
456 * readers
457 * 3) A read lock can be taken if there are no pending or active writers on the
458 * lock (and the controller lock for a namespace).
459 */
460 void
nvme_rwlock(nvme_minor_t * minor,nvme_ioctl_lock_t * req)461 nvme_rwlock(nvme_minor_t *minor, nvme_ioctl_lock_t *req)
462 {
463 nvme_t *const nvme = minor->nm_ctrl;
464 const boolean_t is_nonblock = (req->nil_flags &
465 NVME_LOCK_F_DONT_BLOCK) != 0;
466 const boolean_t is_read = req->nil_level == NVME_LOCK_L_READ;
467 const boolean_t is_ctrl = req->nil_ent == NVME_LOCK_E_CTRL;
468 nvme_minor_lock_info_t *info;
469 nvme_lock_t *lock;
470 boolean_t waiters;
471 hrtime_t sleep_time;
472
473 VERIFY(MUTEX_HELD(&nvme->n_minor_mutex));
474
475 if (is_ctrl) {
476 info = &minor->nm_ctrl_lock;
477 lock = &nvme->n_lock;
478
479 if (is_read) {
480 waiters = nvme_rwlock_block_ctrl_rdlock(nvme);
481 } else {
482 waiters = nvme_rwlock_block_ctrl_wrlock(nvme);
483 }
484 } else {
485 nvme_namespace_t *ns;
486 const uint32_t nsid = req->nil_common.nioc_nsid;
487 info = &minor->nm_ns_lock;
488
489 VERIFY3U(req->nil_ent, ==, NVME_LOCK_E_NS);
490 ns = nvme_nsid2ns(nvme, nsid);
491 minor->nm_ns_lock.nli_ns = ns;
492 lock = &ns->ns_lock;
493
494 if (is_read) {
495 waiters = nvme_rwlock_block_ns_rdlock(nvme, ns);
496 } else {
497 waiters = nvme_rwlock_block_ns_wrlock(nvme, ns);
498 }
499 }
500
501 /*
502 * Set the information that indicates what kind of lock we're attempting
503 * to acquire and that we're operating on.
504 */
505 info->nli_curlevel = is_read ? NVME_LOCK_L_READ : NVME_LOCK_L_WRITE;
506 info->nli_lock = lock;
507
508 /*
509 * We think we can get the lock, hurrah.
510 */
511 if (!waiters) {
512 if (is_read) {
513 nvme_rwlock_rdlock(info, lock);
514 } else {
515 nvme_rwlock_wrlock(info, lock);
516 }
517 (void) nvme_ioctl_success(&req->nil_common);
518 return;
519 }
520
521 /*
522 * We failed to get the lock. At this point we will set ourselves up to
523 * block. Once we go to sleep on the CV, our assumption is that anyone
524 * who has woken us up will have filled in the information the status of
525 * this operation and therefore after this point, all we have to do is
526 * return.
527 */
528 if (is_nonblock) {
529 nvme_rwunlock_cleanup_minor(info);
530 lock->nl_nnonblock++;
531 (void) nvme_ioctl_error(&req->nil_common,
532 NVME_IOCTL_E_LOCK_WOULD_BLOCK, 0, 0);
533 return;
534 }
535
536 ASSERT3P(info->nli_ioc, ==, NULL);
537 info->nli_ioc = &req->nil_common;
538 if (is_read) {
539 list_insert_tail(&lock->nl_pend_readers, info);
540 lock->nl_npend_reads++;
541 } else {
542 list_insert_tail(&lock->nl_pend_writers, info);
543 lock->nl_npend_writes++;
544 }
545
546 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_UNLOCKED);
547 info->nli_state = NVME_LOCK_STATE_BLOCKED;
548 sleep_time = gethrtime();
549 info->nli_last_change = sleep_time;
550 while (info->nli_state == NVME_LOCK_STATE_BLOCKED) {
551 /*
552 * Block until we receive a signal. Note, a signal trumps all
553 * other processing. We may be woken up here because we acquired
554 * a lock, we may also end up woken up here if the controller is
555 * marked as dead.
556 */
557 if (cv_wait_sig(&minor->nm_cv, &nvme->n_minor_mutex) == 0) {
558 nvme_rwlock_signal(info, lock, is_read);
559 break;
560 }
561 }
562
563 /*
564 * Before we return, clean up and sanity check our state.
565 */
566 info->nli_ioc = NULL;
567 #ifdef DEBUG
568 ASSERT3S(info->nli_last_change, !=, sleep_time);
569 if (info->nli_state == NVME_LOCK_STATE_UNLOCKED) {
570 ASSERT3S(list_link_active(&info->nli_node), ==, 0);
571 ASSERT3P(info->nli_ns, ==, NULL);
572 ASSERT3U(req->nil_common.nioc_drv_err, !=, NVME_IOCTL_E_OK);
573 } else {
574 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_ACQUIRED);
575 ASSERT3U(req->nil_common.nioc_drv_err, ==, NVME_IOCTL_E_OK);
576 if (is_read) {
577 ASSERT3S(list_link_active(&info->nli_node), !=, 0);
578 } else {
579 ASSERT3P(lock->nl_writer, ==, info);
580 }
581 }
582 ASSERT3P(info->nli_minor, ==, minor);
583 ASSERT3P(info->nli_nvme, ==, minor->nm_ctrl);
584 #endif
585 }
586
587 /*
588 * This is used to clean up a single minor that was blocking trying to get a
589 * lock prior to a controller going dead. In particular, the key here is we need
590 * to change its state to unlocked by cleaning it up and then signal it to wake
591 * up and process things. The clean up also helps deal with the case of a racing
592 * signal, though it does leave the state a little awkward in this intermediate
593 * moment; however, since it's been removed from a list that's really the proper
594 * action and no one can issue new lock ioctls at this point.
595 */
596 static void
nvme_rwlock_ctrl_dead_cleanup_one(nvme_t * nvme,nvme_minor_lock_info_t * info)597 nvme_rwlock_ctrl_dead_cleanup_one(nvme_t *nvme, nvme_minor_lock_info_t *info)
598 {
599 ASSERT3U(info->nli_state, ==, NVME_LOCK_STATE_BLOCKED);
600 ASSERT3P(info->nli_ioc, !=, NULL);
601
602 /*
603 * Update the last time this has changed for our snaity checks.
604 */
605 info->nli_last_change = gethrtime();
606 nvme_rwunlock_cleanup_minor(info);
607 nvme_rwlock_signal_one(info, nvme->n_dead_status);
608 }
609
610 /*
611 * We've just been informed that this controller has set n_dead. This is most
612 * unfortunate for anyone trying to actively use it right now and we must notify
613 * them. Anyone who has successfully obtained a lock gets to keep it until they
614 * drop it (hopefully soon). Anyone who is asleep should be kicked out being
615 * told they are not getting it.
616 *
617 * The moment we grab n_minor_mutex, no other state here can change. So we can
618 * go ahead and wake up all waiters with impunity. This is being called from the
619 * nvme_dead_taskq.
620 */
621 void
nvme_rwlock_ctrl_dead(void * arg)622 nvme_rwlock_ctrl_dead(void *arg)
623 {
624 nvme_t *nvme = arg;
625 nvme_lock_t *ctrl_lock = &nvme->n_lock;
626 nvme_minor_lock_info_t *info;
627
628 mutex_enter(&nvme->n_minor_mutex);
629 for (uint32_t i = 1; i <= nvme->n_namespace_count; i++) {
630 nvme_namespace_t *ns = nvme_nsid2ns(nvme, i);
631 nvme_lock_t *ns_lock = &ns->ns_lock;
632
633 while ((info = list_remove_head(&ns_lock->nl_pend_readers)) !=
634 NULL) {
635 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
636 }
637
638 while ((info = list_remove_head(&ns_lock->nl_pend_writers)) !=
639 NULL) {
640 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
641 }
642 }
643
644 while ((info = list_remove_head(&ctrl_lock->nl_pend_readers)) != NULL) {
645 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
646 }
647
648 while ((info = list_remove_head(&ctrl_lock->nl_pend_writers)) != NULL) {
649
650 nvme_rwlock_ctrl_dead_cleanup_one(nvme, info);
651 }
652 mutex_exit(&nvme->n_minor_mutex);
653 }
654
655 void
nvme_lock_fini(nvme_lock_t * lock)656 nvme_lock_fini(nvme_lock_t *lock)
657 {
658 VERIFY3P(lock->nl_writer, ==, NULL);
659 list_destroy(&lock->nl_pend_writers);
660 list_destroy(&lock->nl_pend_readers);
661 list_destroy(&lock->nl_readers);
662 }
663
664 void
nvme_lock_init(nvme_lock_t * lock)665 nvme_lock_init(nvme_lock_t *lock)
666 {
667 list_create(&lock->nl_readers, sizeof (nvme_minor_lock_info_t),
668 offsetof(nvme_minor_lock_info_t, nli_node));
669 list_create(&lock->nl_pend_readers, sizeof (nvme_minor_lock_info_t),
670 offsetof(nvme_minor_lock_info_t, nli_node));
671 list_create(&lock->nl_pend_writers, sizeof (nvme_minor_lock_info_t),
672 offsetof(nvme_minor_lock_info_t, nli_node));
673 }
674