1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright 2018 Nexenta Systems, Inc.
28 */
29
30 #include <sys/systm.h>
31 #include <sys/cmn_err.h>
32 #include <sys/kmem.h>
33 #include <sys/disp.h>
34 #include <sys/id_space.h>
35 #include <sys/atomic.h>
36 #include <rpc/rpc.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfs4_db_impl.h>
39 #include <sys/sdt.h>
40
41 static int rfs4_reap_interval = RFS4_REAP_INTERVAL;
42
43 static void rfs4_dbe_reap(rfs4_table_t *, time_t, uint32_t);
44 static void rfs4_dbe_destroy(rfs4_dbe_t *);
45 static rfs4_dbe_t *rfs4_dbe_create(rfs4_table_t *, id_t, rfs4_entry_t);
46 static void rfs4_start_reaper(rfs4_table_t *);
47
48 /*
49 * t_lowat - integer percentage of table entries /etc/system only
50 * t_hiwat - integer percentage of table entries /etc/system only
51 * t_lreap - integer percentage of table reap time mdb or /etc/system
52 * t_hreap - integer percentage of table reap time mdb or /etc/system
53 */
54 uint32_t t_lowat = 50; /* reap at t_lreap when id's in use hit 50% */
55 uint32_t t_hiwat = 75; /* reap at t_hreap when id's in use hit 75% */
56 time_t t_lreap = 50; /* default to 50% of table's reap interval */
57 time_t t_hreap = 10; /* default to 10% of table's reap interval */
58
59 id_t
rfs4_dbe_getid(rfs4_dbe_t * entry)60 rfs4_dbe_getid(rfs4_dbe_t *entry)
61 {
62 return (entry->dbe_id);
63 }
64
65 void
rfs4_dbe_hold(rfs4_dbe_t * entry)66 rfs4_dbe_hold(rfs4_dbe_t *entry)
67 {
68 atomic_inc_32(&entry->dbe_refcnt);
69 }
70
71 /*
72 * rfs4_dbe_rele_nolock only decrements the reference count of the entry.
73 */
74 void
rfs4_dbe_rele_nolock(rfs4_dbe_t * entry)75 rfs4_dbe_rele_nolock(rfs4_dbe_t *entry)
76 {
77 atomic_dec_32(&entry->dbe_refcnt);
78 }
79
80
81 uint32_t
rfs4_dbe_refcnt(rfs4_dbe_t * entry)82 rfs4_dbe_refcnt(rfs4_dbe_t *entry)
83 {
84 return (entry->dbe_refcnt);
85 }
86
87 /*
88 * Mark an entry such that the dbsearch will skip it.
89 * Caller does not want this entry to be found any longer
90 */
91 void
rfs4_dbe_invalidate(rfs4_dbe_t * entry)92 rfs4_dbe_invalidate(rfs4_dbe_t *entry)
93 {
94 entry->dbe_invalid = TRUE;
95 entry->dbe_skipsearch = TRUE;
96 }
97
98 /*
99 * Is this entry invalid?
100 */
101 bool_t
rfs4_dbe_is_invalid(rfs4_dbe_t * entry)102 rfs4_dbe_is_invalid(rfs4_dbe_t *entry)
103 {
104 return (entry->dbe_invalid);
105 }
106
107 time_t
rfs4_dbe_get_timerele(rfs4_dbe_t * entry)108 rfs4_dbe_get_timerele(rfs4_dbe_t *entry)
109 {
110 return (entry->dbe_time_rele);
111 }
112
113 /*
114 * Use these to temporarily hide/unhide a db entry.
115 */
116 void
rfs4_dbe_hide(rfs4_dbe_t * entry)117 rfs4_dbe_hide(rfs4_dbe_t *entry)
118 {
119 rfs4_dbe_lock(entry);
120 entry->dbe_skipsearch = TRUE;
121 rfs4_dbe_unlock(entry);
122 }
123
124 void
rfs4_dbe_unhide(rfs4_dbe_t * entry)125 rfs4_dbe_unhide(rfs4_dbe_t *entry)
126 {
127 rfs4_dbe_lock(entry);
128 entry->dbe_skipsearch = FALSE;
129 rfs4_dbe_unlock(entry);
130 }
131
132 void
rfs4_dbe_rele(rfs4_dbe_t * entry)133 rfs4_dbe_rele(rfs4_dbe_t *entry)
134 {
135 mutex_enter(entry->dbe_lock);
136 ASSERT(entry->dbe_refcnt > 1);
137 atomic_dec_32(&entry->dbe_refcnt);
138 entry->dbe_time_rele = gethrestime_sec();
139 mutex_exit(entry->dbe_lock);
140 }
141
142 void
rfs4_dbe_lock(rfs4_dbe_t * entry)143 rfs4_dbe_lock(rfs4_dbe_t *entry)
144 {
145 mutex_enter(entry->dbe_lock);
146 }
147
148 void
rfs4_dbe_unlock(rfs4_dbe_t * entry)149 rfs4_dbe_unlock(rfs4_dbe_t *entry)
150 {
151 mutex_exit(entry->dbe_lock);
152 }
153
154 bool_t
rfs4_dbe_islocked(rfs4_dbe_t * entry)155 rfs4_dbe_islocked(rfs4_dbe_t *entry)
156 {
157 return (mutex_owned(entry->dbe_lock));
158 }
159
160 clock_t
rfs4_dbe_twait(rfs4_dbe_t * entry,clock_t timeout)161 rfs4_dbe_twait(rfs4_dbe_t *entry, clock_t timeout)
162 {
163 return (cv_timedwait(entry->dbe_cv, entry->dbe_lock, timeout));
164 }
165
166 void
rfs4_dbe_cv_broadcast(rfs4_dbe_t * entry)167 rfs4_dbe_cv_broadcast(rfs4_dbe_t *entry)
168 {
169 cv_broadcast(entry->dbe_cv);
170 }
171
172 static int
rfs4_dbe_kmem_constructor(void * obj,void * private __unused,int kmflag __unused)173 rfs4_dbe_kmem_constructor(void *obj, void *private __unused,
174 int kmflag __unused)
175 {
176 rfs4_dbe_t *entry = obj;
177
178 mutex_init(entry->dbe_lock, NULL, MUTEX_DEFAULT, NULL);
179 cv_init(entry->dbe_cv, NULL, CV_DEFAULT, NULL);
180
181 return (0);
182 }
183
184 static void
rfs4_dbe_kmem_destructor(void * obj,void * private __unused)185 rfs4_dbe_kmem_destructor(void *obj, void *private __unused)
186 {
187 rfs4_dbe_t *entry = obj;
188
189 mutex_destroy(entry->dbe_lock);
190 cv_destroy(entry->dbe_cv);
191 }
192
193 rfs4_database_t *
rfs4_database_create(uint32_t flags)194 rfs4_database_create(uint32_t flags)
195 {
196 rfs4_database_t *db;
197
198 db = kmem_alloc(sizeof (rfs4_database_t), KM_SLEEP);
199 mutex_init(db->db_lock, NULL, MUTEX_DEFAULT, NULL);
200 db->db_tables = NULL;
201 db->db_debug_flags = flags;
202 db->db_shutdown_count = 0;
203 cv_init(&db->db_shutdown_wait, NULL, CV_DEFAULT, NULL);
204 return (db);
205 }
206
207
208 /*
209 * The reaper threads that have been created for the tables in this
210 * database must be stopped and the entries in the tables released.
211 * Each table will be marked as "shutdown" and the reaper threads
212 * poked and they will see that a shutdown is in progress and cleanup
213 * and exit. This function waits for all reaper threads to stop
214 * before returning to the caller.
215 */
216 void
rfs4_database_shutdown(rfs4_database_t * db)217 rfs4_database_shutdown(rfs4_database_t *db)
218 {
219 rfs4_table_t *table;
220
221 mutex_enter(db->db_lock);
222 for (table = db->db_tables; table; table = table->dbt_tnext) {
223 mutex_enter(&table->dbt_reaper_cv_lock);
224 table->dbt_reaper_shutdown = TRUE;
225 cv_broadcast(&table->dbt_reaper_wait);
226 db->db_shutdown_count++;
227 mutex_exit(&table->dbt_reaper_cv_lock);
228 }
229 while (db->db_shutdown_count > 0) {
230 cv_wait(&db->db_shutdown_wait, db->db_lock);
231 }
232 mutex_exit(db->db_lock);
233 }
234
235 /*
236 * Given a database that has been "shutdown" by the function above all
237 * of the table tables are destroyed and then the database itself
238 * freed.
239 */
240 void
rfs4_database_destroy(rfs4_database_t * db)241 rfs4_database_destroy(rfs4_database_t *db)
242 {
243 rfs4_table_t *next, *tmp;
244
245 for (next = db->db_tables; next; ) {
246 tmp = next;
247 next = tmp->dbt_tnext;
248 rfs4_table_destroy(db, tmp);
249 }
250
251 mutex_destroy(db->db_lock);
252 kmem_free(db, sizeof (rfs4_database_t));
253 }
254
255 /*
256 * Used to get the correct kmem_cache database for the state table being
257 * created.
258 * Helper function for rfs4_table_create
259 */
260 static kmem_cache_t *
get_db_mem_cache(char * name)261 get_db_mem_cache(char *name)
262 {
263 int i;
264
265 for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
266 if (strcmp(name, rfs4_db_mem_cache_table[i].r_db_name) == 0)
267 return (rfs4_db_mem_cache_table[i].r_db_mem_cache);
268 }
269 /*
270 * There is no associated kmem cache for this NFS4 server state
271 * table name
272 */
273 return (NULL);
274 }
275
276 /*
277 * Used to initialize the global NFSv4 server state database.
278 * Helper funtion for rfs4_state_g_init and called when module is loaded.
279 */
280 kmem_cache_t *
281 /* CSTYLED */
nfs4_init_mem_cache(char * cache_name,uint32_t idxcnt,uint32_t size,uint32_t idx)282 nfs4_init_mem_cache(char *cache_name, uint32_t idxcnt, uint32_t size, uint32_t idx)
283 {
284 kmem_cache_t *mem_cache = kmem_cache_create(cache_name,
285 sizeof (rfs4_dbe_t) + idxcnt * sizeof (rfs4_link_t) + size,
286 0,
287 rfs4_dbe_kmem_constructor,
288 rfs4_dbe_kmem_destructor,
289 NULL,
290 NULL,
291 NULL,
292 0);
293 (void) strlcpy(rfs4_db_mem_cache_table[idx].r_db_name, cache_name,
294 strlen(cache_name) + 1);
295 rfs4_db_mem_cache_table[idx].r_db_mem_cache = mem_cache;
296 return (mem_cache);
297 }
298
299 rfs4_table_t *
rfs4_table_create(rfs4_database_t * db,char * tabname,time_t max_cache_time,uint32_t idxcnt,bool_t (* create)(rfs4_entry_t,void *),void (* destroy)(rfs4_entry_t),bool_t (* expiry)(rfs4_entry_t),uint32_t size,uint32_t hashsize,uint32_t maxentries,id_t start)300 rfs4_table_create(rfs4_database_t *db, char *tabname, time_t max_cache_time,
301 uint32_t idxcnt, bool_t (*create)(rfs4_entry_t, void *),
302 void (*destroy)(rfs4_entry_t),
303 bool_t (*expiry)(rfs4_entry_t),
304 uint32_t size, uint32_t hashsize,
305 uint32_t maxentries, id_t start)
306 {
307 rfs4_table_t *table;
308 int len;
309 char *cache_name;
310 char *id_name;
311
312 table = kmem_alloc(sizeof (rfs4_table_t), KM_SLEEP);
313 table->dbt_db = db;
314 rw_init(table->dbt_t_lock, NULL, RW_DEFAULT, NULL);
315 mutex_init(table->dbt_lock, NULL, MUTEX_DEFAULT, NULL);
316 mutex_init(&table->dbt_reaper_cv_lock, NULL, MUTEX_DEFAULT, NULL);
317 cv_init(&table->dbt_reaper_wait, NULL, CV_DEFAULT, NULL);
318
319 len = strlen(tabname);
320 table->dbt_name = kmem_alloc(len+1, KM_SLEEP);
321 cache_name = kmem_alloc(len + 12 /* "_entry_cache" */ + 1, KM_SLEEP);
322 (void) strcpy(table->dbt_name, tabname);
323 (void) sprintf(cache_name, "%s_entry_cache", table->dbt_name);
324 table->dbt_max_cache_time = max_cache_time;
325 table->dbt_usize = size;
326 table->dbt_len = hashsize;
327 table->dbt_count = 0;
328 table->dbt_idxcnt = 0;
329 table->dbt_ccnt = 0;
330 table->dbt_maxcnt = idxcnt;
331 table->dbt_indices = NULL;
332 table->dbt_id_space = NULL;
333 table->dbt_reaper_shutdown = FALSE;
334
335 if (start >= 0) {
336 if (maxentries + (uint32_t)start > (uint32_t)INT32_MAX)
337 maxentries = INT32_MAX - start;
338 id_name = kmem_alloc(len + 9 /* "_id_space" */ + 1, KM_SLEEP);
339 (void) sprintf(id_name, "%s_id_space", table->dbt_name);
340 table->dbt_id_space = id_space_create(id_name, start,
341 maxentries + start);
342 kmem_free(id_name, len + 10);
343 }
344 ASSERT(t_lowat != 0);
345 table->dbt_id_lwat = (maxentries * t_lowat) / 100;
346 ASSERT(t_hiwat != 0);
347 table->dbt_id_hwat = (maxentries * t_hiwat) / 100;
348 table->dbt_id_reap = MIN(rfs4_reap_interval, max_cache_time);
349 table->dbt_maxentries = maxentries;
350 table->dbt_create = create;
351 table->dbt_destroy = destroy;
352 table->dbt_expiry = expiry;
353
354 /*
355 * get the correct kmem_cache for this table type based on the name.
356 */
357 table->dbt_mem_cache = get_db_mem_cache(cache_name);
358
359 kmem_free(cache_name, len+13);
360
361 table->dbt_debug = db->db_debug_flags;
362
363 mutex_enter(db->db_lock);
364 table->dbt_tnext = db->db_tables;
365 db->db_tables = table;
366 mutex_exit(db->db_lock);
367
368 rfs4_start_reaper(table);
369
370 return (table);
371 }
372
373 void
rfs4_table_destroy(rfs4_database_t * db,rfs4_table_t * table)374 rfs4_table_destroy(rfs4_database_t *db, rfs4_table_t *table)
375 {
376 rfs4_table_t *p;
377 rfs4_index_t *idx;
378
379 ASSERT(table->dbt_count == 0);
380
381 mutex_enter(db->db_lock);
382 if (table == db->db_tables)
383 db->db_tables = table->dbt_tnext;
384 else {
385 for (p = db->db_tables; p; p = p->dbt_tnext)
386 if (p->dbt_tnext == table) {
387 p->dbt_tnext = table->dbt_tnext;
388 table->dbt_tnext = NULL;
389 break;
390 }
391 ASSERT(p != NULL);
392 }
393 mutex_exit(db->db_lock);
394
395 /* Destroy indices */
396 while (table->dbt_indices) {
397 idx = table->dbt_indices;
398 table->dbt_indices = idx->dbi_inext;
399 rfs4_index_destroy(idx);
400 }
401
402 rw_destroy(table->dbt_t_lock);
403 mutex_destroy(table->dbt_lock);
404 mutex_destroy(&table->dbt_reaper_cv_lock);
405 cv_destroy(&table->dbt_reaper_wait);
406
407 kmem_free(table->dbt_name, strlen(table->dbt_name) + 1);
408 if (table->dbt_id_space)
409 id_space_destroy(table->dbt_id_space);
410 table->dbt_mem_cache = NULL;
411 kmem_free(table, sizeof (rfs4_table_t));
412 }
413
414 rfs4_index_t *
rfs4_index_create(rfs4_table_t * table,char * keyname,uint32_t (* hash)(void *),bool_t (compare)(rfs4_entry_t,void *),void * (* mkkey)(rfs4_entry_t),bool_t createable)415 rfs4_index_create(rfs4_table_t *table, char *keyname,
416 uint32_t (*hash)(void *),
417 bool_t (compare)(rfs4_entry_t, void *),
418 void *(*mkkey)(rfs4_entry_t),
419 bool_t createable)
420 {
421 rfs4_index_t *idx;
422
423 ASSERT(table->dbt_idxcnt < table->dbt_maxcnt);
424
425 idx = kmem_alloc(sizeof (rfs4_index_t), KM_SLEEP);
426
427 idx->dbi_table = table;
428 idx->dbi_keyname = kmem_alloc(strlen(keyname) + 1, KM_SLEEP);
429 (void) strcpy(idx->dbi_keyname, keyname);
430 idx->dbi_hash = hash;
431 idx->dbi_compare = compare;
432 idx->dbi_mkkey = mkkey;
433 idx->dbi_tblidx = table->dbt_idxcnt;
434 table->dbt_idxcnt++;
435 if (createable) {
436 table->dbt_ccnt++;
437 if (table->dbt_ccnt > 1)
438 panic("Table %s currently can have only have one "
439 "index that will allow creation of entries",
440 table->dbt_name);
441 idx->dbi_createable = TRUE;
442 } else {
443 idx->dbi_createable = FALSE;
444 }
445
446 idx->dbi_inext = table->dbt_indices;
447 table->dbt_indices = idx;
448 idx->dbi_buckets = kmem_zalloc(sizeof (rfs4_bucket_t) * table->dbt_len,
449 KM_SLEEP);
450
451 return (idx);
452 }
453
454 void
rfs4_index_destroy(rfs4_index_t * idx)455 rfs4_index_destroy(rfs4_index_t *idx)
456 {
457 kmem_free(idx->dbi_keyname, strlen(idx->dbi_keyname) + 1);
458 kmem_free(idx->dbi_buckets,
459 sizeof (rfs4_bucket_t) * idx->dbi_table->dbt_len);
460 kmem_free(idx, sizeof (rfs4_index_t));
461 }
462
463 static void
rfs4_dbe_destroy(rfs4_dbe_t * entry)464 rfs4_dbe_destroy(rfs4_dbe_t *entry)
465 {
466 rfs4_index_t *idx;
467 void *key;
468 int i;
469 rfs4_bucket_t *bp;
470 rfs4_table_t *table = entry->dbe_table;
471 rfs4_link_t *l;
472
473 NFS4_DEBUG(table->dbt_debug & DESTROY_DEBUG,
474 (CE_NOTE, "Destroying entry %p from %s",
475 (void*)entry, table->dbt_name));
476
477 mutex_enter(entry->dbe_lock);
478 ASSERT(entry->dbe_refcnt == 0);
479 mutex_exit(entry->dbe_lock);
480
481 /* Unlink from all indices */
482 for (idx = table->dbt_indices; idx; idx = idx->dbi_inext) {
483 l = &entry->dbe_indices[idx->dbi_tblidx];
484 /* check and see if we were ever linked in to the index */
485 if (INVALID_LINK(l)) {
486 ASSERT(l->next == NULL && l->prev == NULL);
487 continue;
488 }
489 key = idx->dbi_mkkey(entry->dbe_data);
490 i = HASH(idx, key);
491 bp = &idx->dbi_buckets[i];
492 ASSERT(bp->dbk_head != NULL);
493 DEQUEUE_IDX(bp, &entry->dbe_indices[idx->dbi_tblidx]);
494 }
495
496 /* Destroy user data */
497 if (table->dbt_destroy)
498 (*table->dbt_destroy)(entry->dbe_data);
499
500 if (table->dbt_id_space)
501 id_free(table->dbt_id_space, entry->dbe_id);
502
503 mutex_enter(table->dbt_lock);
504 table->dbt_count--;
505 mutex_exit(table->dbt_lock);
506
507 /* Destroy the entry itself */
508 kmem_cache_free(table->dbt_mem_cache, entry);
509 }
510
511
512 static rfs4_dbe_t *
rfs4_dbe_create(rfs4_table_t * table,id_t id,rfs4_entry_t data)513 rfs4_dbe_create(rfs4_table_t *table, id_t id, rfs4_entry_t data)
514 {
515 rfs4_dbe_t *entry;
516 int i;
517
518 NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
519 (CE_NOTE, "Creating entry in table %s", table->dbt_name));
520
521 entry = kmem_cache_alloc(table->dbt_mem_cache, KM_SLEEP);
522
523 entry->dbe_refcnt = 1;
524 entry->dbe_invalid = FALSE;
525 entry->dbe_skipsearch = FALSE;
526 entry->dbe_time_rele = 0;
527 entry->dbe_id = 0;
528
529 if (table->dbt_id_space)
530 entry->dbe_id = id;
531 entry->dbe_table = table;
532
533 for (i = 0; i < table->dbt_maxcnt; i++) {
534 entry->dbe_indices[i].next = entry->dbe_indices[i].prev = NULL;
535 entry->dbe_indices[i].entry = entry;
536 /*
537 * We mark the entry as not indexed by setting the low
538 * order bit, since address are word aligned. This has
539 * the advantage of causeing a trap if the address is
540 * used. After the entry is linked in to the
541 * corresponding index the bit will be cleared.
542 */
543 INVALIDATE_ADDR(entry->dbe_indices[i].entry);
544 }
545
546 entry->dbe_data = (rfs4_entry_t)&entry->dbe_indices[table->dbt_maxcnt];
547 bzero(entry->dbe_data, table->dbt_usize);
548 entry->dbe_data->dbe = entry;
549
550 if (!(*table->dbt_create)(entry->dbe_data, data)) {
551 kmem_cache_free(table->dbt_mem_cache, entry);
552 return (NULL);
553 }
554
555 mutex_enter(table->dbt_lock);
556 table->dbt_count++;
557 mutex_exit(table->dbt_lock);
558
559 return (entry);
560 }
561
562 static void
rfs4_dbe_tabreap_adjust(rfs4_table_t * table)563 rfs4_dbe_tabreap_adjust(rfs4_table_t *table)
564 {
565 clock_t tabreap;
566 clock_t reap_int;
567 uint32_t in_use;
568
569 /*
570 * Adjust the table's reap interval based on the
571 * number of id's currently in use. Each table's
572 * default remains the same if id usage subsides.
573 */
574 ASSERT(MUTEX_HELD(&table->dbt_reaper_cv_lock));
575 tabreap = MIN(rfs4_reap_interval, table->dbt_max_cache_time);
576
577 in_use = table->dbt_count + 1; /* see rfs4_dbe_create */
578 if (in_use >= table->dbt_id_hwat) {
579 ASSERT(t_hreap != 0);
580 reap_int = (tabreap * t_hreap) / 100;
581 } else if (in_use >= table->dbt_id_lwat) {
582 ASSERT(t_lreap != 0);
583 reap_int = (tabreap * t_lreap) / 100;
584 } else {
585 reap_int = tabreap;
586 }
587 table->dbt_id_reap = reap_int;
588 DTRACE_PROBE2(table__reap__interval, char *,
589 table->dbt_name, time_t, table->dbt_id_reap);
590 }
591
592 rfs4_entry_t
rfs4_dbsearch(rfs4_index_t * idx,void * key,bool_t * create,void * arg,rfs4_dbsearch_type_t dbsearch_type)593 rfs4_dbsearch(rfs4_index_t *idx, void *key, bool_t *create, void *arg,
594 rfs4_dbsearch_type_t dbsearch_type)
595 {
596 int already_done;
597 uint32_t i;
598 rfs4_table_t *table = idx->dbi_table;
599 rfs4_index_t *ip;
600 rfs4_bucket_t *bp;
601 rfs4_link_t *l;
602 rfs4_dbe_t *entry;
603 id_t id = -1;
604
605 i = HASH(idx, key);
606 bp = &idx->dbi_buckets[i];
607
608 NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
609 (CE_NOTE, "Searching for key %p in table %s by %s",
610 key, table->dbt_name, idx->dbi_keyname));
611
612 rw_enter(bp->dbk_lock, RW_READER);
613 retry:
614 for (l = bp->dbk_head; l; l = l->next) {
615 if (l->entry->dbe_refcnt > 0 &&
616 (l->entry->dbe_skipsearch == FALSE ||
617 (l->entry->dbe_skipsearch == TRUE &&
618 dbsearch_type == RFS4_DBS_INVALID)) &&
619 (*idx->dbi_compare)(l->entry->dbe_data, key)) {
620 mutex_enter(l->entry->dbe_lock);
621 if (l->entry->dbe_refcnt == 0) {
622 mutex_exit(l->entry->dbe_lock);
623 continue;
624 }
625
626 /* place an additional hold since we are returning */
627 rfs4_dbe_hold(l->entry);
628
629 mutex_exit(l->entry->dbe_lock);
630 rw_exit(bp->dbk_lock);
631
632 *create = FALSE;
633
634 NFS4_DEBUG((table->dbt_debug & SEARCH_DEBUG),
635 (CE_NOTE, "Found entry %p for %p in table %s",
636 (void *)l->entry, key, table->dbt_name));
637
638 if (id != -1)
639 id_free(table->dbt_id_space, id);
640 return (l->entry->dbe_data);
641 }
642 }
643
644 if (!*create || table->dbt_create == NULL || !idx->dbi_createable ||
645 table->dbt_maxentries == table->dbt_count) {
646 NFS4_DEBUG(table->dbt_debug & SEARCH_DEBUG,
647 (CE_NOTE, "Entry for %p in %s not found",
648 key, table->dbt_name));
649
650 rw_exit(bp->dbk_lock);
651 if (id != -1)
652 id_free(table->dbt_id_space, id);
653 return (NULL);
654 }
655
656 if (table->dbt_id_space && id == -1) {
657 rw_exit(bp->dbk_lock);
658
659 /* get an id, ok to sleep for it here */
660 id = id_alloc(table->dbt_id_space);
661 ASSERT(id != -1);
662
663 mutex_enter(&table->dbt_reaper_cv_lock);
664 rfs4_dbe_tabreap_adjust(table);
665 mutex_exit(&table->dbt_reaper_cv_lock);
666
667 rw_enter(bp->dbk_lock, RW_WRITER);
668 goto retry;
669 }
670
671 /* get an exclusive lock on the bucket */
672 if (rw_read_locked(bp->dbk_lock) && !rw_tryupgrade(bp->dbk_lock)) {
673 NFS4_DEBUG(table->dbt_debug & OTHER_DEBUG,
674 (CE_NOTE, "Trying to upgrade lock on "
675 "hash chain %d (%p) for %s by %s",
676 i, (void*)bp, table->dbt_name, idx->dbi_keyname));
677
678 rw_exit(bp->dbk_lock);
679 rw_enter(bp->dbk_lock, RW_WRITER);
680 goto retry;
681 }
682
683 /* create entry */
684 entry = rfs4_dbe_create(table, id, arg);
685 if (entry == NULL) {
686 rw_exit(bp->dbk_lock);
687 if (id != -1)
688 id_free(table->dbt_id_space, id);
689
690 NFS4_DEBUG(table->dbt_debug & CREATE_DEBUG,
691 (CE_NOTE, "Constructor for table %s failed",
692 table->dbt_name));
693 return (NULL);
694 }
695
696 /*
697 * Add one ref for entry into table's hash - only one
698 * reference added even though there may be multiple indices
699 */
700 rfs4_dbe_hold(entry);
701 ENQUEUE(bp->dbk_head, &entry->dbe_indices[idx->dbi_tblidx]);
702 VALIDATE_ADDR(entry->dbe_indices[idx->dbi_tblidx].entry);
703
704 already_done = idx->dbi_tblidx;
705 rw_exit(bp->dbk_lock);
706
707 for (ip = table->dbt_indices; ip; ip = ip->dbi_inext) {
708 if (ip->dbi_tblidx == already_done)
709 continue;
710 l = &entry->dbe_indices[ip->dbi_tblidx];
711 i = HASH(ip, ip->dbi_mkkey(entry->dbe_data));
712 ASSERT(i < ip->dbi_table->dbt_len);
713 bp = &ip->dbi_buckets[i];
714 ENQUEUE_IDX(bp, l);
715 }
716
717 NFS4_DEBUG(
718 table->dbt_debug & SEARCH_DEBUG || table->dbt_debug & CREATE_DEBUG,
719 (CE_NOTE, "Entry %p created for %s = %p in table %s",
720 (void*)entry, idx->dbi_keyname, (void*)key, table->dbt_name));
721
722 return (entry->dbe_data);
723 }
724
725 /*ARGSUSED*/
726 boolean_t
rfs4_cpr_callb(void * arg,int code)727 rfs4_cpr_callb(void *arg, int code)
728 {
729 rfs4_bucket_t *buckets, *bp;
730 rfs4_link_t *l;
731 rfs4_client_t *cp;
732 int i;
733
734 nfs4_srv_t *nsrv4 = nfs4_get_srv();
735 rfs4_table_t *table = nsrv4->rfs4_client_tab;
736
737 /*
738 * We get called for Suspend and Resume events.
739 * For the suspend case we simply don't care! Nor do we care if
740 * there are no clients.
741 */
742 if (code == CB_CODE_CPR_CHKPT || table == NULL) {
743 return (B_TRUE);
744 }
745
746 buckets = table->dbt_indices->dbi_buckets;
747
748 /*
749 * When we get this far we are in the process of
750 * resuming the system from a previous suspend.
751 *
752 * We are going to blast through and update the
753 * last_access time for all the clients and in
754 * doing so extend them by one lease period.
755 */
756 for (i = 0; i < table->dbt_len; i++) {
757 bp = &buckets[i];
758 for (l = bp->dbk_head; l; l = l->next) {
759 cp = (rfs4_client_t *)l->entry->dbe_data;
760 cp->rc_last_access = gethrestime_sec();
761 }
762 }
763
764 return (B_TRUE);
765 }
766
767 /*
768 * Given a table, lock each of the buckets and walk all entries (in
769 * turn locking those) and calling the provided "callout" function
770 * with the provided parameter. Obviously used to iterate across all
771 * entries in a particular table via the database locking hierarchy.
772 * Obviously the caller must not hold locks on any of the entries in
773 * the specified table.
774 */
775 void
rfs4_dbe_walk(rfs4_table_t * table,void (* callout)(rfs4_entry_t,void *),void * data)776 rfs4_dbe_walk(rfs4_table_t *table,
777 void (*callout)(rfs4_entry_t, void *),
778 void *data)
779 {
780 rfs4_bucket_t *buckets = table->dbt_indices->dbi_buckets, *bp;
781 rfs4_link_t *l;
782 rfs4_dbe_t *entry;
783 int i;
784
785 NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
786 (CE_NOTE, "Walking entries in %s", table->dbt_name));
787
788 /* Walk the buckets looking for entries to release/destroy */
789 for (i = 0; i < table->dbt_len; i++) {
790 bp = &buckets[i];
791 rw_enter(bp->dbk_lock, RW_READER);
792 for (l = bp->dbk_head; l; l = l->next) {
793 entry = l->entry;
794 mutex_enter(entry->dbe_lock);
795 (*callout)(entry->dbe_data, data);
796 mutex_exit(entry->dbe_lock);
797 }
798 rw_exit(bp->dbk_lock);
799 }
800
801 NFS4_DEBUG(table->dbt_debug & WALK_DEBUG,
802 (CE_NOTE, "Walking entries complete %s", table->dbt_name));
803 }
804
805
806 static void
rfs4_dbe_reap(rfs4_table_t * table,time_t cache_time,uint32_t desired)807 rfs4_dbe_reap(rfs4_table_t *table, time_t cache_time, uint32_t desired)
808 {
809 rfs4_index_t *idx = table->dbt_indices;
810 rfs4_bucket_t *buckets = idx->dbi_buckets, *bp;
811 rfs4_link_t *l, *t;
812 rfs4_dbe_t *entry;
813 bool_t found;
814 int i;
815 int count = 0;
816
817 NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
818 (CE_NOTE, "Reaping %d entries older than %ld seconds in table %s",
819 desired, cache_time, table->dbt_name));
820
821 /* Walk the buckets looking for entries to release/destroy */
822 for (i = 0; i < table->dbt_len; i++) {
823 bp = &buckets[i];
824 do {
825 found = FALSE;
826 rw_enter(bp->dbk_lock, RW_READER);
827 for (l = bp->dbk_head; l; l = l->next) {
828 entry = l->entry;
829 /*
830 * Examine an entry. Ref count of 1 means
831 * that the only reference is for the hash
832 * table reference.
833 */
834 if (entry->dbe_refcnt != 1)
835 continue;
836 mutex_enter(entry->dbe_lock);
837 if ((entry->dbe_refcnt == 1) &&
838 (table->dbt_reaper_shutdown ||
839 table->dbt_expiry == NULL ||
840 (*table->dbt_expiry)(entry->dbe_data))) {
841 entry->dbe_refcnt--;
842 count++;
843 found = TRUE;
844 }
845 mutex_exit(entry->dbe_lock);
846 }
847 if (found) {
848 if (!rw_tryupgrade(bp->dbk_lock)) {
849 rw_exit(bp->dbk_lock);
850 rw_enter(bp->dbk_lock, RW_WRITER);
851 }
852
853 l = bp->dbk_head;
854 while (l) {
855 t = l;
856 entry = t->entry;
857 l = l->next;
858 if (entry->dbe_refcnt == 0) {
859 DEQUEUE(bp->dbk_head, t);
860 t->next = NULL;
861 t->prev = NULL;
862 INVALIDATE_ADDR(t->entry);
863 rfs4_dbe_destroy(entry);
864 }
865 }
866 }
867 rw_exit(bp->dbk_lock);
868 /*
869 * delay slightly if there is more work to do
870 * with the expectation that other reaper
871 * threads are freeing data structures as well
872 * and in turn will reduce ref counts on
873 * entries in this table allowing them to be
874 * released. This is only done in the
875 * instance that the tables are being shut down.
876 */
877 if (table->dbt_reaper_shutdown && bp->dbk_head != NULL)
878 delay(hz/100);
879 /*
880 * If this is a table shutdown, keep going until
881 * everything is gone
882 */
883 } while (table->dbt_reaper_shutdown && bp->dbk_head != NULL);
884
885 if (!table->dbt_reaper_shutdown && desired && count >= desired)
886 break;
887 }
888
889 NFS4_DEBUG(table->dbt_debug & REAP_DEBUG,
890 (CE_NOTE, "Reaped %d entries older than %ld seconds in table %s",
891 count, cache_time, table->dbt_name));
892 }
893
894 static void
reaper_thread(caddr_t * arg)895 reaper_thread(caddr_t *arg)
896 {
897 rfs4_table_t *table = (rfs4_table_t *)arg;
898 clock_t rc;
899
900 NFS4_DEBUG(table->dbt_debug,
901 (CE_NOTE, "rfs4_reaper_thread starting for %s", table->dbt_name));
902
903 CALLB_CPR_INIT(&table->dbt_reaper_cpr_info, &table->dbt_reaper_cv_lock,
904 callb_generic_cpr, "nfsv4Reaper");
905
906 mutex_enter(&table->dbt_reaper_cv_lock);
907 do {
908 CALLB_CPR_SAFE_BEGIN(&table->dbt_reaper_cpr_info);
909 rc = cv_reltimedwait_sig(&table->dbt_reaper_wait,
910 &table->dbt_reaper_cv_lock,
911 SEC_TO_TICK(table->dbt_id_reap), TR_CLOCK_TICK);
912 CALLB_CPR_SAFE_END(&table->dbt_reaper_cpr_info,
913 &table->dbt_reaper_cv_lock);
914 rfs4_dbe_reap(table, table->dbt_max_cache_time, 0);
915 } while (rc != 0 && table->dbt_reaper_shutdown == FALSE);
916
917 CALLB_CPR_EXIT(&table->dbt_reaper_cpr_info);
918
919 NFS4_DEBUG(table->dbt_debug,
920 (CE_NOTE, "rfs4_reaper_thread exiting for %s", table->dbt_name));
921
922 /* Notify the database shutdown processing that the table is shutdown */
923 mutex_enter(table->dbt_db->db_lock);
924 table->dbt_db->db_shutdown_count--;
925 cv_signal(&table->dbt_db->db_shutdown_wait);
926 mutex_exit(table->dbt_db->db_lock);
927 zthread_exit();
928 }
929
930 static void
rfs4_start_reaper(rfs4_table_t * table)931 rfs4_start_reaper(rfs4_table_t *table)
932 {
933 if (table->dbt_max_cache_time == 0)
934 return;
935
936 (void) zthread_create(NULL, 0, reaper_thread, table, 0,
937 minclsyspri);
938 }
939
940 #ifdef DEBUG
941 void
rfs4_dbe_debug(rfs4_dbe_t * entry)942 rfs4_dbe_debug(rfs4_dbe_t *entry)
943 {
944 cmn_err(CE_NOTE, "Entry %p from table %s",
945 (void *)entry, entry->dbe_table->dbt_name);
946 cmn_err(CE_CONT, "\trefcnt = %d id = %d",
947 entry->dbe_refcnt, entry->dbe_id);
948 }
949 #endif
950