17e1e7763SThomas Graf /* 27e1e7763SThomas Graf * Resizable, Scalable, Concurrent Hash Table 37e1e7763SThomas Graf * 47e1e7763SThomas Graf * Copyright (c) 2014 Thomas Graf <tgraf@suug.ch> 57e1e7763SThomas Graf * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net> 67e1e7763SThomas Graf * 77e1e7763SThomas Graf * Based on the following paper: 87e1e7763SThomas Graf * https://www.usenix.org/legacy/event/atc11/tech/final_files/Triplett.pdf 97e1e7763SThomas Graf * 107e1e7763SThomas Graf * Code partially derived from nft_hash 117e1e7763SThomas Graf * 127e1e7763SThomas Graf * This program is free software; you can redistribute it and/or modify 137e1e7763SThomas Graf * it under the terms of the GNU General Public License version 2 as 147e1e7763SThomas Graf * published by the Free Software Foundation. 157e1e7763SThomas Graf */ 167e1e7763SThomas Graf 177e1e7763SThomas Graf #include <linux/kernel.h> 187e1e7763SThomas Graf #include <linux/init.h> 197e1e7763SThomas Graf #include <linux/log2.h> 207e1e7763SThomas Graf #include <linux/slab.h> 217e1e7763SThomas Graf #include <linux/vmalloc.h> 227e1e7763SThomas Graf #include <linux/mm.h> 2387545899SDaniel Borkmann #include <linux/jhash.h> 247e1e7763SThomas Graf #include <linux/random.h> 257e1e7763SThomas Graf #include <linux/rhashtable.h> 267e1e7763SThomas Graf 277e1e7763SThomas Graf #define HASH_DEFAULT_SIZE 64UL 287e1e7763SThomas Graf #define HASH_MIN_SIZE 4UL 2997defe1eSThomas Graf #define BUCKET_LOCKS_PER_CPU 128UL 3097defe1eSThomas Graf 31f89bd6f8SThomas Graf /* Base bits plus 1 bit for nulls marker */ 32f89bd6f8SThomas Graf #define HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) 33f89bd6f8SThomas Graf 3497defe1eSThomas Graf enum { 3597defe1eSThomas Graf RHT_LOCK_NORMAL, 3697defe1eSThomas Graf RHT_LOCK_NESTED, 3797defe1eSThomas Graf RHT_LOCK_NESTED2, 3897defe1eSThomas Graf }; 3997defe1eSThomas Graf 4097defe1eSThomas Graf /* The bucket lock is selected based on the hash and protects mutations 4197defe1eSThomas Graf * on a group of hash buckets. 4297defe1eSThomas Graf * 4397defe1eSThomas Graf * IMPORTANT: When holding the bucket lock of both the old and new table 4497defe1eSThomas Graf * during expansions and shrinking, the old bucket lock must always be 4597defe1eSThomas Graf * acquired first. 4697defe1eSThomas Graf */ 4797defe1eSThomas Graf static spinlock_t *bucket_lock(const struct bucket_table *tbl, u32 hash) 4897defe1eSThomas Graf { 4997defe1eSThomas Graf return &tbl->locks[hash & tbl->locks_mask]; 5097defe1eSThomas Graf } 517e1e7763SThomas Graf 527e1e7763SThomas Graf #define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT)) 5397defe1eSThomas Graf #define ASSERT_BUCKET_LOCK(TBL, HASH) \ 5497defe1eSThomas Graf BUG_ON(!lockdep_rht_bucket_is_held(TBL, HASH)) 557e1e7763SThomas Graf 567e1e7763SThomas Graf #ifdef CONFIG_PROVE_LOCKING 5797defe1eSThomas Graf int lockdep_rht_mutex_is_held(struct rhashtable *ht) 587e1e7763SThomas Graf { 5997defe1eSThomas Graf return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1; 607e1e7763SThomas Graf } 617e1e7763SThomas Graf EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held); 6288d6ed15SThomas Graf 6388d6ed15SThomas Graf int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash) 6488d6ed15SThomas Graf { 6597defe1eSThomas Graf spinlock_t *lock = bucket_lock(tbl, hash); 6697defe1eSThomas Graf 6797defe1eSThomas Graf return (debug_locks) ? 
lockdep_is_held(lock) : 1; 6888d6ed15SThomas Graf } 6988d6ed15SThomas Graf EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held); 707e1e7763SThomas Graf #endif 717e1e7763SThomas Graf 72c91eee56SThomas Graf static void *rht_obj(const struct rhashtable *ht, const struct rhash_head *he) 737e1e7763SThomas Graf { 747e1e7763SThomas Graf return (void *) he - ht->p.head_offset; 757e1e7763SThomas Graf } 767e1e7763SThomas Graf 778d24c0b4SThomas Graf static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash) 787e1e7763SThomas Graf { 798d24c0b4SThomas Graf return hash & (tbl->size - 1); 807e1e7763SThomas Graf } 817e1e7763SThomas Graf 828d24c0b4SThomas Graf static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr) 838d24c0b4SThomas Graf { 848d24c0b4SThomas Graf u32 hash; 858d24c0b4SThomas Graf 868d24c0b4SThomas Graf if (unlikely(!ht->p.key_len)) 878d24c0b4SThomas Graf hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd); 888d24c0b4SThomas Graf else 898d24c0b4SThomas Graf hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len, 908d24c0b4SThomas Graf ht->p.hash_rnd); 918d24c0b4SThomas Graf 92f89bd6f8SThomas Graf return hash >> HASH_RESERVED_SPACE; 938d24c0b4SThomas Graf } 948d24c0b4SThomas Graf 9597defe1eSThomas Graf static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len) 967e1e7763SThomas Graf { 977e1e7763SThomas Graf struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); 988d24c0b4SThomas Graf u32 hash; 997e1e7763SThomas Graf 1008d24c0b4SThomas Graf hash = ht->p.hashfn(key, len, ht->p.hash_rnd); 101f89bd6f8SThomas Graf hash >>= HASH_RESERVED_SPACE; 1028d24c0b4SThomas Graf 1038d24c0b4SThomas Graf return rht_bucket_index(tbl, hash); 1047e1e7763SThomas Graf } 1057e1e7763SThomas Graf 1067e1e7763SThomas Graf static u32 head_hashfn(const struct rhashtable *ht, 1078d24c0b4SThomas Graf const struct bucket_table *tbl, 1088d24c0b4SThomas Graf const struct rhash_head *he) 1097e1e7763SThomas Graf { 1108d24c0b4SThomas Graf return rht_bucket_index(tbl, obj_raw_hashfn(ht, rht_obj(ht, he))); 1117e1e7763SThomas Graf } 1127e1e7763SThomas Graf 113b8e1943eSThomas Graf static struct rhash_head __rcu **bucket_tail(struct bucket_table *tbl, u32 n) 114b8e1943eSThomas Graf { 115b8e1943eSThomas Graf struct rhash_head __rcu **pprev; 116b8e1943eSThomas Graf 117b8e1943eSThomas Graf for (pprev = &tbl->buckets[n]; 118f89bd6f8SThomas Graf !rht_is_a_nulls(rht_dereference_bucket(*pprev, tbl, n)); 119b8e1943eSThomas Graf pprev = &rht_dereference_bucket(*pprev, tbl, n)->next) 120b8e1943eSThomas Graf ; 121b8e1943eSThomas Graf 122b8e1943eSThomas Graf return pprev; 123b8e1943eSThomas Graf } 124b8e1943eSThomas Graf 12597defe1eSThomas Graf static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl) 12697defe1eSThomas Graf { 12797defe1eSThomas Graf unsigned int i, size; 12897defe1eSThomas Graf #if defined(CONFIG_PROVE_LOCKING) 12997defe1eSThomas Graf unsigned int nr_pcpus = 2; 13097defe1eSThomas Graf #else 13197defe1eSThomas Graf unsigned int nr_pcpus = num_possible_cpus(); 13297defe1eSThomas Graf #endif 13397defe1eSThomas Graf 13497defe1eSThomas Graf nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL); 13597defe1eSThomas Graf size = roundup_pow_of_two(nr_pcpus * ht->p.locks_mul); 13697defe1eSThomas Graf 13797defe1eSThomas Graf /* Never allocate more than one lock per bucket */ 13897defe1eSThomas Graf size = min_t(unsigned int, size, tbl->size); 13997defe1eSThomas Graf 14097defe1eSThomas Graf if (sizeof(spinlock_t) != 0) { 14197defe1eSThomas Graf #ifdef CONFIG_NUMA 14297defe1eSThomas Graf if (size 
* sizeof(spinlock_t) > PAGE_SIZE) 14397defe1eSThomas Graf tbl->locks = vmalloc(size * sizeof(spinlock_t)); 14497defe1eSThomas Graf else 14597defe1eSThomas Graf #endif 14697defe1eSThomas Graf tbl->locks = kmalloc_array(size, sizeof(spinlock_t), 14797defe1eSThomas Graf GFP_KERNEL); 14897defe1eSThomas Graf if (!tbl->locks) 14997defe1eSThomas Graf return -ENOMEM; 15097defe1eSThomas Graf for (i = 0; i < size; i++) 15197defe1eSThomas Graf spin_lock_init(&tbl->locks[i]); 15297defe1eSThomas Graf } 15397defe1eSThomas Graf tbl->locks_mask = size - 1; 15497defe1eSThomas Graf 15597defe1eSThomas Graf return 0; 15697defe1eSThomas Graf } 15797defe1eSThomas Graf 15897defe1eSThomas Graf static void bucket_table_free(const struct bucket_table *tbl) 15997defe1eSThomas Graf { 16097defe1eSThomas Graf if (tbl) 16197defe1eSThomas Graf kvfree(tbl->locks); 16297defe1eSThomas Graf 16397defe1eSThomas Graf kvfree(tbl); 16497defe1eSThomas Graf } 16597defe1eSThomas Graf 16697defe1eSThomas Graf static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, 16797defe1eSThomas Graf size_t nbuckets) 1687e1e7763SThomas Graf { 1697e1e7763SThomas Graf struct bucket_table *tbl; 1707e1e7763SThomas Graf size_t size; 171f89bd6f8SThomas Graf int i; 1727e1e7763SThomas Graf 1737e1e7763SThomas Graf size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]); 1746eba8224SThomas Graf tbl = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1757e1e7763SThomas Graf if (tbl == NULL) 1767e1e7763SThomas Graf tbl = vzalloc(size); 1777e1e7763SThomas Graf 1787e1e7763SThomas Graf if (tbl == NULL) 1797e1e7763SThomas Graf return NULL; 1807e1e7763SThomas Graf 1817e1e7763SThomas Graf tbl->size = nbuckets; 1827e1e7763SThomas Graf 18397defe1eSThomas Graf if (alloc_bucket_locks(ht, tbl) < 0) { 18497defe1eSThomas Graf bucket_table_free(tbl); 18597defe1eSThomas Graf return NULL; 1867e1e7763SThomas Graf } 1877e1e7763SThomas Graf 188f89bd6f8SThomas Graf for (i = 0; i < nbuckets; i++) 189f89bd6f8SThomas Graf INIT_RHT_NULLS_HEAD(tbl->buckets[i], ht, i); 190f89bd6f8SThomas Graf 19197defe1eSThomas Graf return tbl; 1927e1e7763SThomas Graf } 1937e1e7763SThomas Graf 1947e1e7763SThomas Graf /** 1957e1e7763SThomas Graf * rht_grow_above_75 - returns true if nelems > 0.75 * table-size 1967e1e7763SThomas Graf * @ht: hash table 1977e1e7763SThomas Graf * @new_size: new table size 1987e1e7763SThomas Graf */ 1997e1e7763SThomas Graf bool rht_grow_above_75(const struct rhashtable *ht, size_t new_size) 2007e1e7763SThomas Graf { 2017e1e7763SThomas Graf /* Expand table when exceeding 75% load */ 202c0c09bfdSYing Xue return atomic_read(&ht->nelems) > (new_size / 4 * 3) && 203c0c09bfdSYing Xue (ht->p.max_shift && atomic_read(&ht->shift) < ht->p.max_shift); 2047e1e7763SThomas Graf } 2057e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rht_grow_above_75); 2067e1e7763SThomas Graf 2077e1e7763SThomas Graf /** 2087e1e7763SThomas Graf * rht_shrink_below_30 - returns true if nelems < 0.3 * table-size 2097e1e7763SThomas Graf * @ht: hash table 2107e1e7763SThomas Graf * @new_size: new table size 2117e1e7763SThomas Graf */ 2127e1e7763SThomas Graf bool rht_shrink_below_30(const struct rhashtable *ht, size_t new_size) 2137e1e7763SThomas Graf { 2147e1e7763SThomas Graf /* Shrink table beneath 30% load */ 215c0c09bfdSYing Xue return atomic_read(&ht->nelems) < (new_size * 3 / 10) && 216c0c09bfdSYing Xue (atomic_read(&ht->shift) > ht->p.min_shift); 2177e1e7763SThomas Graf } 2187e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rht_shrink_below_30); 2197e1e7763SThomas Graf 2207e1e7763SThomas Graf static void 
hashtable_chain_unzip(const struct rhashtable *ht, 2217e1e7763SThomas Graf const struct bucket_table *new_tbl, 22297defe1eSThomas Graf struct bucket_table *old_tbl, 22397defe1eSThomas Graf size_t old_hash) 2247e1e7763SThomas Graf { 2257e1e7763SThomas Graf struct rhash_head *he, *p, *next; 22697defe1eSThomas Graf spinlock_t *new_bucket_lock, *new_bucket_lock2 = NULL; 22797defe1eSThomas Graf unsigned int new_hash, new_hash2; 22897defe1eSThomas Graf 22997defe1eSThomas Graf ASSERT_BUCKET_LOCK(old_tbl, old_hash); 2307e1e7763SThomas Graf 2317e1e7763SThomas Graf /* Old bucket empty, no work needed. */ 23297defe1eSThomas Graf p = rht_dereference_bucket(old_tbl->buckets[old_hash], old_tbl, 23397defe1eSThomas Graf old_hash); 234f89bd6f8SThomas Graf if (rht_is_a_nulls(p)) 2357e1e7763SThomas Graf return; 2367e1e7763SThomas Graf 23797defe1eSThomas Graf new_hash = new_hash2 = head_hashfn(ht, new_tbl, p); 23897defe1eSThomas Graf new_bucket_lock = bucket_lock(new_tbl, new_hash); 23997defe1eSThomas Graf 2407e1e7763SThomas Graf /* Advance the old bucket pointer one or more times until it 2417e1e7763SThomas Graf * reaches a node that doesn't hash to the same bucket as the 2427e1e7763SThomas Graf * previous node p. Call the previous node p; 2437e1e7763SThomas Graf */ 24497defe1eSThomas Graf rht_for_each_continue(he, p->next, old_tbl, old_hash) { 24597defe1eSThomas Graf new_hash2 = head_hashfn(ht, new_tbl, he); 24697defe1eSThomas Graf if (new_hash != new_hash2) 2477e1e7763SThomas Graf break; 2487e1e7763SThomas Graf p = he; 2497e1e7763SThomas Graf } 25097defe1eSThomas Graf rcu_assign_pointer(old_tbl->buckets[old_hash], p->next); 25197defe1eSThomas Graf 25297defe1eSThomas Graf spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); 25397defe1eSThomas Graf 25497defe1eSThomas Graf /* If we have encountered an entry that maps to a different bucket in 25597defe1eSThomas Graf * the new table, lock down that bucket as well as we might cut off 25697defe1eSThomas Graf * the end of the chain. 25797defe1eSThomas Graf */ 25897defe1eSThomas Graf new_bucket_lock2 = bucket_lock(new_tbl, new_hash); 25997defe1eSThomas Graf if (new_bucket_lock != new_bucket_lock2) 26097defe1eSThomas Graf spin_lock_bh_nested(new_bucket_lock2, RHT_LOCK_NESTED2); 2617e1e7763SThomas Graf 2627e1e7763SThomas Graf /* Find the subsequent node which does hash to the same 2637e1e7763SThomas Graf * bucket as node P, or NULL if no such node exists. 
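	 *
	 * Worked example (illustrative, not taken from the original
	 * comments): given an old-bucket chain A -> B -> C -> D where
	 * A, B and D rehash to the same new-table bucket as the head
	 * and C does not, the loop above leaves p at B and he at C,
	 * the old bucket head has already been advanced to C, and the
	 * search below locates D so that p->next can skip straight to
	 * it.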
2647e1e7763SThomas Graf */ 265f89bd6f8SThomas Graf INIT_RHT_NULLS_HEAD(next, ht, old_hash); 266f89bd6f8SThomas Graf if (!rht_is_a_nulls(he)) { 26797defe1eSThomas Graf rht_for_each_continue(he, he->next, old_tbl, old_hash) { 26897defe1eSThomas Graf if (head_hashfn(ht, new_tbl, he) == new_hash) { 2697e1e7763SThomas Graf next = he; 2707e1e7763SThomas Graf break; 2717e1e7763SThomas Graf } 2727e1e7763SThomas Graf } 2737e1e7763SThomas Graf } 2747e1e7763SThomas Graf 2757e1e7763SThomas Graf /* Set p's next pointer to that subsequent node pointer, 2767e1e7763SThomas Graf * bypassing the nodes which do not hash to p's bucket 2777e1e7763SThomas Graf */ 27897defe1eSThomas Graf rcu_assign_pointer(p->next, next); 27997defe1eSThomas Graf 28097defe1eSThomas Graf if (new_bucket_lock != new_bucket_lock2) 28197defe1eSThomas Graf spin_unlock_bh(new_bucket_lock2); 28297defe1eSThomas Graf spin_unlock_bh(new_bucket_lock); 28397defe1eSThomas Graf } 28497defe1eSThomas Graf 28597defe1eSThomas Graf static void link_old_to_new(struct bucket_table *new_tbl, 28697defe1eSThomas Graf unsigned int new_hash, struct rhash_head *entry) 28797defe1eSThomas Graf { 28897defe1eSThomas Graf spinlock_t *new_bucket_lock; 28997defe1eSThomas Graf 29097defe1eSThomas Graf new_bucket_lock = bucket_lock(new_tbl, new_hash); 29197defe1eSThomas Graf 29297defe1eSThomas Graf spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); 29397defe1eSThomas Graf rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), entry); 29497defe1eSThomas Graf spin_unlock_bh(new_bucket_lock); 2957e1e7763SThomas Graf } 2967e1e7763SThomas Graf 2977e1e7763SThomas Graf /** 2987e1e7763SThomas Graf * rhashtable_expand - Expand hash table while allowing concurrent lookups 2997e1e7763SThomas Graf * @ht: the hash table to expand 3007e1e7763SThomas Graf * 3017e1e7763SThomas Graf * A secondary bucket array is allocated and the hash entries are migrated 3027e1e7763SThomas Graf * while keeping them on both lists until the end of the RCU grace period. 3037e1e7763SThomas Graf * 3047e1e7763SThomas Graf * This function may only be called in a context where it is safe to call 3057e1e7763SThomas Graf * synchronize_rcu(), e.g. not within a rcu_read_lock() section. 3067e1e7763SThomas Graf * 30797defe1eSThomas Graf * The caller must ensure that no concurrent resizing occurs by holding 30897defe1eSThomas Graf * ht->mutex. 30997defe1eSThomas Graf * 31097defe1eSThomas Graf * It is valid to have concurrent insertions and deletions protected by per 31197defe1eSThomas Graf * bucket locks or concurrent RCU protected lookups and traversals. 3127e1e7763SThomas Graf */ 3136eba8224SThomas Graf int rhashtable_expand(struct rhashtable *ht) 3147e1e7763SThomas Graf { 3157e1e7763SThomas Graf struct bucket_table *new_tbl, *old_tbl = rht_dereference(ht->tbl, ht); 3167e1e7763SThomas Graf struct rhash_head *he; 31797defe1eSThomas Graf spinlock_t *old_bucket_lock; 31897defe1eSThomas Graf unsigned int new_hash, old_hash; 31997defe1eSThomas Graf bool complete = false; 3207e1e7763SThomas Graf 3217e1e7763SThomas Graf ASSERT_RHT_MUTEX(ht); 3227e1e7763SThomas Graf 32397defe1eSThomas Graf new_tbl = bucket_table_alloc(ht, old_tbl->size * 2); 3247e1e7763SThomas Graf if (new_tbl == NULL) 3257e1e7763SThomas Graf return -ENOMEM; 3267e1e7763SThomas Graf 327c0c09bfdSYing Xue atomic_inc(&ht->shift); 3287e1e7763SThomas Graf 32997defe1eSThomas Graf /* Make insertions go into the new, empty table right away. Deletions 33097defe1eSThomas Graf * and lookups will be attempted in both tables until we synchronize. 
33197defe1eSThomas Graf * The synchronize_rcu() guarantees for the new table to be picked up 33297defe1eSThomas Graf * so no new additions go into the old table while we relink. 3337e1e7763SThomas Graf */ 33497defe1eSThomas Graf rcu_assign_pointer(ht->future_tbl, new_tbl); 33597defe1eSThomas Graf synchronize_rcu(); 33697defe1eSThomas Graf 33797defe1eSThomas Graf /* For each new bucket, search the corresponding old bucket for the 33897defe1eSThomas Graf * first entry that hashes to the new bucket, and link the end of 33997defe1eSThomas Graf * newly formed bucket chain (containing entries added to future 34097defe1eSThomas Graf * table) to that entry. Since all the entries which will end up in 34197defe1eSThomas Graf * the new bucket appear in the same old bucket, this constructs an 34297defe1eSThomas Graf * entirely valid new hash table, but with multiple buckets 34397defe1eSThomas Graf * "zipped" together into a single imprecise chain. 34497defe1eSThomas Graf */ 34597defe1eSThomas Graf for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { 34697defe1eSThomas Graf old_hash = rht_bucket_index(old_tbl, new_hash); 34797defe1eSThomas Graf old_bucket_lock = bucket_lock(old_tbl, old_hash); 34897defe1eSThomas Graf 34997defe1eSThomas Graf spin_lock_bh(old_bucket_lock); 35097defe1eSThomas Graf rht_for_each(he, old_tbl, old_hash) { 35197defe1eSThomas Graf if (head_hashfn(ht, new_tbl, he) == new_hash) { 35297defe1eSThomas Graf link_old_to_new(new_tbl, new_hash, he); 3537e1e7763SThomas Graf break; 3547e1e7763SThomas Graf } 3557e1e7763SThomas Graf } 35697defe1eSThomas Graf spin_unlock_bh(old_bucket_lock); 3577e1e7763SThomas Graf } 3587e1e7763SThomas Graf 3597e1e7763SThomas Graf /* Publish the new table pointer. Lookups may now traverse 3600c828f2fSHerbert Xu * the new table, but they will not benefit from any 3610c828f2fSHerbert Xu * additional efficiency until later steps unzip the buckets. 3627e1e7763SThomas Graf */ 3637e1e7763SThomas Graf rcu_assign_pointer(ht->tbl, new_tbl); 3647e1e7763SThomas Graf 3657e1e7763SThomas Graf /* Unzip interleaved hash chains */ 36697defe1eSThomas Graf while (!complete && !ht->being_destroyed) { 3677e1e7763SThomas Graf /* Wait for readers. All new readers will see the new 3687e1e7763SThomas Graf * table, and thus no references to the old table will 3697e1e7763SThomas Graf * remain. 3707e1e7763SThomas Graf */ 3717e1e7763SThomas Graf synchronize_rcu(); 3727e1e7763SThomas Graf 3737e1e7763SThomas Graf /* For each bucket in the old table (each of which 3747e1e7763SThomas Graf * contains items from multiple buckets of the new 3757e1e7763SThomas Graf * table): ... 
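		 *
		 * Worked example (illustrative): when doubling from 4 to
		 * 8 buckets, old bucket 1 holds the entries destined for
		 * new buckets 1 and 5, so each pass of
		 * hashtable_chain_unzip() below splits one more run of
		 * entries off the shared chain until the old bucket is
		 * empty and the loop can complete.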
3767e1e7763SThomas Graf */ 3777e1e7763SThomas Graf complete = true; 37897defe1eSThomas Graf for (old_hash = 0; old_hash < old_tbl->size; old_hash++) { 379f89bd6f8SThomas Graf struct rhash_head *head; 380f89bd6f8SThomas Graf 38197defe1eSThomas Graf old_bucket_lock = bucket_lock(old_tbl, old_hash); 38297defe1eSThomas Graf spin_lock_bh(old_bucket_lock); 38397defe1eSThomas Graf 38497defe1eSThomas Graf hashtable_chain_unzip(ht, new_tbl, old_tbl, old_hash); 385f89bd6f8SThomas Graf head = rht_dereference_bucket(old_tbl->buckets[old_hash], 386f89bd6f8SThomas Graf old_tbl, old_hash); 387f89bd6f8SThomas Graf if (!rht_is_a_nulls(head)) 3887e1e7763SThomas Graf complete = false; 38997defe1eSThomas Graf 39097defe1eSThomas Graf spin_unlock_bh(old_bucket_lock); 3917e1e7763SThomas Graf } 39297defe1eSThomas Graf } 3937e1e7763SThomas Graf 3947e1e7763SThomas Graf bucket_table_free(old_tbl); 3957e1e7763SThomas Graf return 0; 3967e1e7763SThomas Graf } 3977e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_expand); 3987e1e7763SThomas Graf 3997e1e7763SThomas Graf /** 4007e1e7763SThomas Graf * rhashtable_shrink - Shrink hash table while allowing concurrent lookups 4017e1e7763SThomas Graf * @ht: the hash table to shrink 4027e1e7763SThomas Graf * 4037e1e7763SThomas Graf * This function may only be called in a context where it is safe to call 4047e1e7763SThomas Graf * synchronize_rcu(), e.g. not within a rcu_read_lock() section. 4057e1e7763SThomas Graf * 40697defe1eSThomas Graf * The caller must ensure that no concurrent resizing occurs by holding 40797defe1eSThomas Graf * ht->mutex. 40897defe1eSThomas Graf * 4097e1e7763SThomas Graf * The caller must ensure that no concurrent table mutations take place. 4107e1e7763SThomas Graf * It is however valid to have concurrent lookups if they are RCU protected. 41197defe1eSThomas Graf * 41297defe1eSThomas Graf * It is valid to have concurrent insertions and deletions protected by per 41397defe1eSThomas Graf * bucket locks or concurrent RCU protected lookups and traversals. 4147e1e7763SThomas Graf */ 4156eba8224SThomas Graf int rhashtable_shrink(struct rhashtable *ht) 4167e1e7763SThomas Graf { 41797defe1eSThomas Graf struct bucket_table *new_tbl, *tbl = rht_dereference(ht->tbl, ht); 41897defe1eSThomas Graf spinlock_t *new_bucket_lock, *old_bucket_lock1, *old_bucket_lock2; 41997defe1eSThomas Graf unsigned int new_hash; 4207e1e7763SThomas Graf 4217e1e7763SThomas Graf ASSERT_RHT_MUTEX(ht); 4227e1e7763SThomas Graf 42397defe1eSThomas Graf new_tbl = bucket_table_alloc(ht, tbl->size / 2); 42497defe1eSThomas Graf if (new_tbl == NULL) 4257e1e7763SThomas Graf return -ENOMEM; 4267e1e7763SThomas Graf 42797defe1eSThomas Graf rcu_assign_pointer(ht->future_tbl, new_tbl); 42897defe1eSThomas Graf synchronize_rcu(); 4297e1e7763SThomas Graf 43097defe1eSThomas Graf /* Link the first entry in the old bucket to the end of the 43197defe1eSThomas Graf * bucket in the new table. As entries are concurrently being 43297defe1eSThomas Graf * added to the new table, lock down the new bucket. As we 43397defe1eSThomas Graf * always divide the size in half when shrinking, each bucket 43497defe1eSThomas Graf * in the new table maps to exactly two buckets in the old 43597defe1eSThomas Graf * table. 43697defe1eSThomas Graf * 43797defe1eSThomas Graf * As removals can occur concurrently on the old table, we need 43897defe1eSThomas Graf * to lock down both matching buckets in the old table. 
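	 *
	 * Worked example (illustrative): when halving from 8 to 4
	 * buckets, new bucket 1 takes over the chains of old buckets
	 * 1 and 5, i.e. new_hash and new_hash + new_tbl->size in the
	 * loop below.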
4397e1e7763SThomas Graf */ 44097defe1eSThomas Graf for (new_hash = 0; new_hash < new_tbl->size; new_hash++) { 44197defe1eSThomas Graf old_bucket_lock1 = bucket_lock(tbl, new_hash); 44297defe1eSThomas Graf old_bucket_lock2 = bucket_lock(tbl, new_hash + new_tbl->size); 44397defe1eSThomas Graf new_bucket_lock = bucket_lock(new_tbl, new_hash); 4447e1e7763SThomas Graf 44597defe1eSThomas Graf spin_lock_bh(old_bucket_lock1); 44680ca8c3aSThomas Graf 44780ca8c3aSThomas Graf /* Depending on the lock per buckets mapping, the bucket in 44880ca8c3aSThomas Graf * the lower and upper region may map to the same lock. 44980ca8c3aSThomas Graf */ 45080ca8c3aSThomas Graf if (old_bucket_lock1 != old_bucket_lock2) { 45197defe1eSThomas Graf spin_lock_bh_nested(old_bucket_lock2, RHT_LOCK_NESTED); 45297defe1eSThomas Graf spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED2); 45380ca8c3aSThomas Graf } else { 45480ca8c3aSThomas Graf spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); 45580ca8c3aSThomas Graf } 45697defe1eSThomas Graf 45797defe1eSThomas Graf rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), 45897defe1eSThomas Graf tbl->buckets[new_hash]); 45997defe1eSThomas Graf rcu_assign_pointer(*bucket_tail(new_tbl, new_hash), 46097defe1eSThomas Graf tbl->buckets[new_hash + new_tbl->size]); 46197defe1eSThomas Graf 46297defe1eSThomas Graf spin_unlock_bh(new_bucket_lock); 46380ca8c3aSThomas Graf if (old_bucket_lock1 != old_bucket_lock2) 46497defe1eSThomas Graf spin_unlock_bh(old_bucket_lock2); 46597defe1eSThomas Graf spin_unlock_bh(old_bucket_lock1); 4667e1e7763SThomas Graf } 4677e1e7763SThomas Graf 4687e1e7763SThomas Graf /* Publish the new, valid hash table */ 46997defe1eSThomas Graf rcu_assign_pointer(ht->tbl, new_tbl); 470c0c09bfdSYing Xue atomic_dec(&ht->shift); 4717e1e7763SThomas Graf 4727e1e7763SThomas Graf /* Wait for readers. No new readers will have references to the 4737e1e7763SThomas Graf * old hash table. 
4747e1e7763SThomas Graf */ 4757e1e7763SThomas Graf synchronize_rcu(); 4767e1e7763SThomas Graf 4777e1e7763SThomas Graf bucket_table_free(tbl); 4787e1e7763SThomas Graf 4797e1e7763SThomas Graf return 0; 4807e1e7763SThomas Graf } 4817e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_shrink); 4827e1e7763SThomas Graf 48397defe1eSThomas Graf static void rht_deferred_worker(struct work_struct *work) 48497defe1eSThomas Graf { 48597defe1eSThomas Graf struct rhashtable *ht; 48697defe1eSThomas Graf struct bucket_table *tbl; 487*f2dba9c6SHerbert Xu struct rhashtable_walker *walker; 48897defe1eSThomas Graf 48957699a40SYing Xue ht = container_of(work, struct rhashtable, run_work); 49097defe1eSThomas Graf mutex_lock(&ht->mutex); 49128134a53SHerbert Xu if (ht->being_destroyed) 49228134a53SHerbert Xu goto unlock; 49328134a53SHerbert Xu 49497defe1eSThomas Graf tbl = rht_dereference(ht->tbl, ht); 49597defe1eSThomas Graf 496*f2dba9c6SHerbert Xu list_for_each_entry(walker, &ht->walkers, list) 497*f2dba9c6SHerbert Xu walker->resize = true; 498*f2dba9c6SHerbert Xu 49997defe1eSThomas Graf if (ht->p.grow_decision && ht->p.grow_decision(ht, tbl->size)) 50097defe1eSThomas Graf rhashtable_expand(ht); 50197defe1eSThomas Graf else if (ht->p.shrink_decision && ht->p.shrink_decision(ht, tbl->size)) 50297defe1eSThomas Graf rhashtable_shrink(ht); 50397defe1eSThomas Graf 50428134a53SHerbert Xu unlock: 50597defe1eSThomas Graf mutex_unlock(&ht->mutex); 50697defe1eSThomas Graf } 50797defe1eSThomas Graf 50854c5b7d3SYing Xue static void rhashtable_wakeup_worker(struct rhashtable *ht) 50954c5b7d3SYing Xue { 51054c5b7d3SYing Xue struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); 51154c5b7d3SYing Xue struct bucket_table *new_tbl = rht_dereference_rcu(ht->future_tbl, ht); 51254c5b7d3SYing Xue size_t size = tbl->size; 51354c5b7d3SYing Xue 51454c5b7d3SYing Xue /* Only adjust the table if no resizing is currently in progress. */ 51554c5b7d3SYing Xue if (tbl == new_tbl && 51654c5b7d3SYing Xue ((ht->p.grow_decision && ht->p.grow_decision(ht, size)) || 51754c5b7d3SYing Xue (ht->p.shrink_decision && ht->p.shrink_decision(ht, size)))) 51857699a40SYing Xue schedule_work(&ht->run_work); 51954c5b7d3SYing Xue } 52054c5b7d3SYing Xue 521db304854SYing Xue static void __rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj, 522db304854SYing Xue struct bucket_table *tbl, u32 hash) 523db304854SYing Xue { 524db304854SYing Xue struct rhash_head *head = rht_dereference_bucket(tbl->buckets[hash], 525db304854SYing Xue tbl, hash); 526db304854SYing Xue 527db304854SYing Xue if (rht_is_a_nulls(head)) 528db304854SYing Xue INIT_RHT_NULLS_HEAD(obj->next, ht, hash); 529db304854SYing Xue else 530db304854SYing Xue RCU_INIT_POINTER(obj->next, head); 531db304854SYing Xue 532db304854SYing Xue rcu_assign_pointer(tbl->buckets[hash], obj); 533db304854SYing Xue 534db304854SYing Xue atomic_inc(&ht->nelems); 535db304854SYing Xue 536db304854SYing Xue rhashtable_wakeup_worker(ht); 537db304854SYing Xue } 538db304854SYing Xue 5397e1e7763SThomas Graf /** 540db304854SYing Xue * rhashtable_insert - insert object into hash table 5417e1e7763SThomas Graf * @ht: hash table 5427e1e7763SThomas Graf * @obj: pointer to hash head inside object 5437e1e7763SThomas Graf * 54497defe1eSThomas Graf * Will take a per bucket spinlock to protect against mutual mutations 54597defe1eSThomas Graf * on the same bucket. Multiple insertions may occur in parallel unless 54697defe1eSThomas Graf * they map to the same bucket lock. 
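 *
 * Minimal usage sketch (illustrative only; struct test_obj is the example
 * structure from the rhashtable_init() documentation below, and error
 * handling is omitted):
 *
 *	struct test_obj *obj = kmalloc(sizeof(*obj), GFP_KERNEL);
 *
 *	obj->key = 1;
 *	rhashtable_insert(ht, &obj->node);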
5477e1e7763SThomas Graf  *
54897defe1eSThomas Graf  * It is safe to call this function from atomic context.
54997defe1eSThomas Graf  *
55097defe1eSThomas Graf  * Will trigger an automatic deferred table resizing if the size grows
55197defe1eSThomas Graf  * beyond the watermark indicated by grow_decision() which can be passed
55297defe1eSThomas Graf  * to rhashtable_init().
5537e1e7763SThomas Graf  */
5546eba8224SThomas Graf void rhashtable_insert(struct rhashtable *ht, struct rhash_head *obj)
5557e1e7763SThomas Graf {
55697defe1eSThomas Graf 	struct bucket_table *tbl;
55797defe1eSThomas Graf 	spinlock_t *lock;
55897defe1eSThomas Graf 	unsigned hash;
5597e1e7763SThomas Graf 
56097defe1eSThomas Graf 	rcu_read_lock();
5617e1e7763SThomas Graf 
56297defe1eSThomas Graf 	tbl = rht_dereference_rcu(ht->future_tbl, ht);
5638d24c0b4SThomas Graf 	hash = head_hashfn(ht, tbl, obj);
56497defe1eSThomas Graf 	lock = bucket_lock(tbl, hash);
56597defe1eSThomas Graf 
56697defe1eSThomas Graf 	spin_lock_bh(lock);
567db304854SYing Xue 	__rhashtable_insert(ht, obj, tbl, hash);
56897defe1eSThomas Graf 	spin_unlock_bh(lock);
5697e1e7763SThomas Graf 
57097defe1eSThomas Graf 	rcu_read_unlock();
5717e1e7763SThomas Graf }
5727e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_insert);
5737e1e7763SThomas Graf 
5747e1e7763SThomas Graf /**
5757e1e7763SThomas Graf  * rhashtable_remove - remove object from hash table
5767e1e7763SThomas Graf  * @ht: hash table
5777e1e7763SThomas Graf  * @obj: pointer to hash head inside object
5787e1e7763SThomas Graf  *
5797e1e7763SThomas Graf  * Since the hash chain is singly linked, the removal operation needs to
5807e1e7763SThomas Graf  * walk the bucket chain upon removal. The removal operation is thus
5817e1e7763SThomas Graf  * considerably slower if the hash table is not correctly sized.
5827e1e7763SThomas Graf  *
583db304854SYing Xue  * Will automatically shrink the table via rhashtable_shrink() if the
5847e1e7763SThomas Graf  * shrink_decision function specified at rhashtable_init() returns true.
5857e1e7763SThomas Graf  *
5867e1e7763SThomas Graf  * The caller must ensure that no concurrent table mutations occur. It is
5877e1e7763SThomas Graf  * however valid to have concurrent lookups if they are RCU protected.
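 *
 * Minimal usage sketch (illustrative only; obj was located earlier, e.g.
 * via rhashtable_lookup(), and the rcu member name is made up for the
 * example - freeing must wait for an RCU grace period because readers
 * may still be traversing the chain):
 *
 *	if (rhashtable_remove(ht, &obj->node))
 *		kfree_rcu(obj, rcu);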
5887e1e7763SThomas Graf */ 5896eba8224SThomas Graf bool rhashtable_remove(struct rhashtable *ht, struct rhash_head *obj) 5907e1e7763SThomas Graf { 59197defe1eSThomas Graf struct bucket_table *tbl; 5927e1e7763SThomas Graf struct rhash_head __rcu **pprev; 5937e1e7763SThomas Graf struct rhash_head *he; 59497defe1eSThomas Graf spinlock_t *lock; 59597defe1eSThomas Graf unsigned int hash; 596fe6a043cSThomas Graf bool ret = false; 5977e1e7763SThomas Graf 59897defe1eSThomas Graf rcu_read_lock(); 59997defe1eSThomas Graf tbl = rht_dereference_rcu(ht->tbl, ht); 60097defe1eSThomas Graf hash = head_hashfn(ht, tbl, obj); 6017e1e7763SThomas Graf 60297defe1eSThomas Graf lock = bucket_lock(tbl, hash); 60397defe1eSThomas Graf spin_lock_bh(lock); 6047e1e7763SThomas Graf 60597defe1eSThomas Graf restart: 60697defe1eSThomas Graf pprev = &tbl->buckets[hash]; 60797defe1eSThomas Graf rht_for_each(he, tbl, hash) { 6087e1e7763SThomas Graf if (he != obj) { 6097e1e7763SThomas Graf pprev = &he->next; 6107e1e7763SThomas Graf continue; 6117e1e7763SThomas Graf } 6127e1e7763SThomas Graf 61397defe1eSThomas Graf rcu_assign_pointer(*pprev, obj->next); 614897362e4SThomas Graf 615fe6a043cSThomas Graf ret = true; 616fe6a043cSThomas Graf break; 6177e1e7763SThomas Graf } 6187e1e7763SThomas Graf 619fe6a043cSThomas Graf /* The entry may be linked in either 'tbl', 'future_tbl', or both. 620fe6a043cSThomas Graf * 'future_tbl' only exists for a short period of time during 621fe6a043cSThomas Graf * resizing. Thus traversing both is fine and the added cost is 622fe6a043cSThomas Graf * very rare. 623fe6a043cSThomas Graf */ 624bd6d4db5SYing Xue if (tbl != rht_dereference_rcu(ht->future_tbl, ht)) { 62597defe1eSThomas Graf spin_unlock_bh(lock); 62697defe1eSThomas Graf 627bd6d4db5SYing Xue tbl = rht_dereference_rcu(ht->future_tbl, ht); 62897defe1eSThomas Graf hash = head_hashfn(ht, tbl, obj); 62997defe1eSThomas Graf 63097defe1eSThomas Graf lock = bucket_lock(tbl, hash); 63197defe1eSThomas Graf spin_lock_bh(lock); 63297defe1eSThomas Graf goto restart; 63397defe1eSThomas Graf } 63497defe1eSThomas Graf 63597defe1eSThomas Graf spin_unlock_bh(lock); 636fe6a043cSThomas Graf 637fe6a043cSThomas Graf if (ret) { 638fe6a043cSThomas Graf atomic_dec(&ht->nelems); 639fe6a043cSThomas Graf rhashtable_wakeup_worker(ht); 640fe6a043cSThomas Graf } 641fe6a043cSThomas Graf 64297defe1eSThomas Graf rcu_read_unlock(); 64397defe1eSThomas Graf 644fe6a043cSThomas Graf return ret; 6457e1e7763SThomas Graf } 6467e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_remove); 6477e1e7763SThomas Graf 648efb975a6SYing Xue struct rhashtable_compare_arg { 649efb975a6SYing Xue struct rhashtable *ht; 650efb975a6SYing Xue const void *key; 651efb975a6SYing Xue }; 652efb975a6SYing Xue 653efb975a6SYing Xue static bool rhashtable_compare(void *ptr, void *arg) 654efb975a6SYing Xue { 655efb975a6SYing Xue struct rhashtable_compare_arg *x = arg; 656efb975a6SYing Xue struct rhashtable *ht = x->ht; 657efb975a6SYing Xue 658efb975a6SYing Xue return !memcmp(ptr + ht->p.key_offset, x->key, ht->p.key_len); 659efb975a6SYing Xue } 660efb975a6SYing Xue 6617e1e7763SThomas Graf /** 6627e1e7763SThomas Graf * rhashtable_lookup - lookup key in hash table 6637e1e7763SThomas Graf * @ht: hash table 6647e1e7763SThomas Graf * @key: pointer to key 6657e1e7763SThomas Graf * 6667e1e7763SThomas Graf * Computes the hash value for the key and traverses the bucket chain looking 6677e1e7763SThomas Graf * for a entry with an identical key. The first matching entry is returned. 
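 *
 * Minimal usage sketch (illustrative only; struct test_obj as in the
 * rhashtable_init() examples below):
 *
 *	int key = 1;
 *	struct test_obj *obj;
 *
 *	rcu_read_lock();
 *	obj = rhashtable_lookup(ht, &key);
 *	if (obj)
 *		pr_debug("key %d found\n", obj->key);
 *	rcu_read_unlock();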
6687e1e7763SThomas Graf * 6697e1e7763SThomas Graf * This lookup function may only be used for fixed key hash table (key_len 670db304854SYing Xue * parameter set). It will BUG() if used inappropriately. 6717e1e7763SThomas Graf * 67297defe1eSThomas Graf * Lookups may occur in parallel with hashtable mutations and resizing. 6737e1e7763SThomas Graf */ 67497defe1eSThomas Graf void *rhashtable_lookup(struct rhashtable *ht, const void *key) 6757e1e7763SThomas Graf { 676efb975a6SYing Xue struct rhashtable_compare_arg arg = { 677efb975a6SYing Xue .ht = ht, 678efb975a6SYing Xue .key = key, 679efb975a6SYing Xue }; 6807e1e7763SThomas Graf 6817e1e7763SThomas Graf BUG_ON(!ht->p.key_len); 6827e1e7763SThomas Graf 683efb975a6SYing Xue return rhashtable_lookup_compare(ht, key, &rhashtable_compare, &arg); 6847e1e7763SThomas Graf } 6857e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_lookup); 6867e1e7763SThomas Graf 6877e1e7763SThomas Graf /** 6887e1e7763SThomas Graf * rhashtable_lookup_compare - search hash table with compare function 6897e1e7763SThomas Graf * @ht: hash table 6908d24c0b4SThomas Graf * @key: the pointer to the key 6917e1e7763SThomas Graf * @compare: compare function, must return true on match 6927e1e7763SThomas Graf * @arg: argument passed on to compare function 6937e1e7763SThomas Graf * 6947e1e7763SThomas Graf * Traverses the bucket chain behind the provided hash value and calls the 6957e1e7763SThomas Graf * specified compare function for each entry. 6967e1e7763SThomas Graf * 69797defe1eSThomas Graf * Lookups may occur in parallel with hashtable mutations and resizing. 6987e1e7763SThomas Graf * 6997e1e7763SThomas Graf * Returns the first entry on which the compare function returned true. 7007e1e7763SThomas Graf */ 70197defe1eSThomas Graf void *rhashtable_lookup_compare(struct rhashtable *ht, const void *key, 7027e1e7763SThomas Graf bool (*compare)(void *, void *), void *arg) 7037e1e7763SThomas Graf { 70497defe1eSThomas Graf const struct bucket_table *tbl, *old_tbl; 7057e1e7763SThomas Graf struct rhash_head *he; 7068d24c0b4SThomas Graf u32 hash; 7077e1e7763SThomas Graf 70897defe1eSThomas Graf rcu_read_lock(); 70997defe1eSThomas Graf 71097defe1eSThomas Graf old_tbl = rht_dereference_rcu(ht->tbl, ht); 71197defe1eSThomas Graf tbl = rht_dereference_rcu(ht->future_tbl, ht); 7128d24c0b4SThomas Graf hash = key_hashfn(ht, key, ht->p.key_len); 71397defe1eSThomas Graf restart: 71497defe1eSThomas Graf rht_for_each_rcu(he, tbl, rht_bucket_index(tbl, hash)) { 7157e1e7763SThomas Graf if (!compare(rht_obj(ht, he), arg)) 7167e1e7763SThomas Graf continue; 71797defe1eSThomas Graf rcu_read_unlock(); 718a4b18cdaSThomas Graf return rht_obj(ht, he); 7197e1e7763SThomas Graf } 7207e1e7763SThomas Graf 72197defe1eSThomas Graf if (unlikely(tbl != old_tbl)) { 72297defe1eSThomas Graf tbl = old_tbl; 72397defe1eSThomas Graf goto restart; 72497defe1eSThomas Graf } 72597defe1eSThomas Graf rcu_read_unlock(); 72697defe1eSThomas Graf 7277e1e7763SThomas Graf return NULL; 7287e1e7763SThomas Graf } 7297e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_lookup_compare); 7307e1e7763SThomas Graf 731db304854SYing Xue /** 732db304854SYing Xue * rhashtable_lookup_insert - lookup and insert object into hash table 733db304854SYing Xue * @ht: hash table 734db304854SYing Xue * @obj: pointer to hash head inside object 735db304854SYing Xue * 736db304854SYing Xue * Locks down the bucket chain in both the old and new table if a resize 737db304854SYing Xue * is in progress to ensure that writers can't remove from the old table 738db304854SYing Xue 
* and can't insert to the new table during the atomic operation of search 739db304854SYing Xue * and insertion. Searches for duplicates in both the old and new table if 740db304854SYing Xue * a resize is in progress. 741db304854SYing Xue * 742db304854SYing Xue * This lookup function may only be used for fixed key hash table (key_len 743db304854SYing Xue * parameter set). It will BUG() if used inappropriately. 744db304854SYing Xue * 745db304854SYing Xue * It is safe to call this function from atomic context. 746db304854SYing Xue * 747db304854SYing Xue * Will trigger an automatic deferred table resizing if the size grows 748db304854SYing Xue * beyond the watermark indicated by grow_decision() which can be passed 749db304854SYing Xue * to rhashtable_init(). 750db304854SYing Xue */ 751db304854SYing Xue bool rhashtable_lookup_insert(struct rhashtable *ht, struct rhash_head *obj) 752db304854SYing Xue { 7537a868d1eSYing Xue struct rhashtable_compare_arg arg = { 7547a868d1eSYing Xue .ht = ht, 7557a868d1eSYing Xue .key = rht_obj(ht, obj) + ht->p.key_offset, 7567a868d1eSYing Xue }; 7577a868d1eSYing Xue 7587a868d1eSYing Xue BUG_ON(!ht->p.key_len); 7597a868d1eSYing Xue 7607a868d1eSYing Xue return rhashtable_lookup_compare_insert(ht, obj, &rhashtable_compare, 7617a868d1eSYing Xue &arg); 7627a868d1eSYing Xue } 7637a868d1eSYing Xue EXPORT_SYMBOL_GPL(rhashtable_lookup_insert); 7647a868d1eSYing Xue 7657a868d1eSYing Xue /** 7667a868d1eSYing Xue * rhashtable_lookup_compare_insert - search and insert object to hash table 7677a868d1eSYing Xue * with compare function 7687a868d1eSYing Xue * @ht: hash table 7697a868d1eSYing Xue * @obj: pointer to hash head inside object 7707a868d1eSYing Xue * @compare: compare function, must return true on match 7717a868d1eSYing Xue * @arg: argument passed on to compare function 7727a868d1eSYing Xue * 7737a868d1eSYing Xue * Locks down the bucket chain in both the old and new table if a resize 7747a868d1eSYing Xue * is in progress to ensure that writers can't remove from the old table 7757a868d1eSYing Xue * and can't insert to the new table during the atomic operation of search 7767a868d1eSYing Xue * and insertion. Searches for duplicates in both the old and new table if 7777a868d1eSYing Xue * a resize is in progress. 7787a868d1eSYing Xue * 7797a868d1eSYing Xue * Lookups may occur in parallel with hashtable mutations and resizing. 7807a868d1eSYing Xue * 7817a868d1eSYing Xue * Will trigger an automatic deferred table resizing if the size grows 7827a868d1eSYing Xue * beyond the watermark indicated by grow_decision() which can be passed 7837a868d1eSYing Xue * to rhashtable_init(). 
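 *
 * Minimal sketch (illustrative only; my_cmp and my_arg stand for a caller
 * supplied compare callback and its argument, following the contract of
 * @compare above):
 *
 *	if (!rhashtable_lookup_compare_insert(ht, &obj->node,
 *					      &my_cmp, &my_arg))
 *		kfree(obj);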
7847a868d1eSYing Xue */ 7857a868d1eSYing Xue bool rhashtable_lookup_compare_insert(struct rhashtable *ht, 7867a868d1eSYing Xue struct rhash_head *obj, 7877a868d1eSYing Xue bool (*compare)(void *, void *), 7887a868d1eSYing Xue void *arg) 7897a868d1eSYing Xue { 790db304854SYing Xue struct bucket_table *new_tbl, *old_tbl; 791db304854SYing Xue spinlock_t *new_bucket_lock, *old_bucket_lock; 792db304854SYing Xue u32 new_hash, old_hash; 793db304854SYing Xue bool success = true; 794db304854SYing Xue 795db304854SYing Xue BUG_ON(!ht->p.key_len); 796db304854SYing Xue 797db304854SYing Xue rcu_read_lock(); 798db304854SYing Xue 799db304854SYing Xue old_tbl = rht_dereference_rcu(ht->tbl, ht); 800db304854SYing Xue old_hash = head_hashfn(ht, old_tbl, obj); 801db304854SYing Xue old_bucket_lock = bucket_lock(old_tbl, old_hash); 802db304854SYing Xue spin_lock_bh(old_bucket_lock); 803db304854SYing Xue 804db304854SYing Xue new_tbl = rht_dereference_rcu(ht->future_tbl, ht); 805db304854SYing Xue new_hash = head_hashfn(ht, new_tbl, obj); 806db304854SYing Xue new_bucket_lock = bucket_lock(new_tbl, new_hash); 807db304854SYing Xue if (unlikely(old_tbl != new_tbl)) 808db304854SYing Xue spin_lock_bh_nested(new_bucket_lock, RHT_LOCK_NESTED); 809db304854SYing Xue 8107a868d1eSYing Xue if (rhashtable_lookup_compare(ht, rht_obj(ht, obj) + ht->p.key_offset, 8117a868d1eSYing Xue compare, arg)) { 812db304854SYing Xue success = false; 813db304854SYing Xue goto exit; 814db304854SYing Xue } 815db304854SYing Xue 816db304854SYing Xue __rhashtable_insert(ht, obj, new_tbl, new_hash); 817db304854SYing Xue 818db304854SYing Xue exit: 819db304854SYing Xue if (unlikely(old_tbl != new_tbl)) 820db304854SYing Xue spin_unlock_bh(new_bucket_lock); 821db304854SYing Xue spin_unlock_bh(old_bucket_lock); 822db304854SYing Xue 823db304854SYing Xue rcu_read_unlock(); 824db304854SYing Xue 825db304854SYing Xue return success; 826db304854SYing Xue } 8277a868d1eSYing Xue EXPORT_SYMBOL_GPL(rhashtable_lookup_compare_insert); 828db304854SYing Xue 829*f2dba9c6SHerbert Xu /** 830*f2dba9c6SHerbert Xu * rhashtable_walk_init - Initialise an iterator 831*f2dba9c6SHerbert Xu * @ht: Table to walk over 832*f2dba9c6SHerbert Xu * @iter: Hash table Iterator 833*f2dba9c6SHerbert Xu * 834*f2dba9c6SHerbert Xu * This function prepares a hash table walk. 835*f2dba9c6SHerbert Xu * 836*f2dba9c6SHerbert Xu * Note that if you restart a walk after rhashtable_walk_stop you 837*f2dba9c6SHerbert Xu * may see the same object twice. Also, you may miss objects if 838*f2dba9c6SHerbert Xu * there are removals in between rhashtable_walk_stop and the next 839*f2dba9c6SHerbert Xu * call to rhashtable_walk_start. 840*f2dba9c6SHerbert Xu * 841*f2dba9c6SHerbert Xu * For a completely stable walk you should construct your own data 842*f2dba9c6SHerbert Xu * structure outside the hash table. 843*f2dba9c6SHerbert Xu * 844*f2dba9c6SHerbert Xu * This function may sleep so you must not call it from interrupt 845*f2dba9c6SHerbert Xu * context or with spin locks held. 846*f2dba9c6SHerbert Xu * 847*f2dba9c6SHerbert Xu * You must call rhashtable_walk_exit if this function returns 848*f2dba9c6SHerbert Xu * successfully. 
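 *
 * A sketch of a complete walk (illustrative only; a concurrent resize,
 * which shows up as -EAGAIN from rhashtable_walk_start() or
 * ERR_PTR(-EAGAIN) from rhashtable_walk_next(), is handled here by
 * simply stopping):
 *
 *	struct rhashtable_iter iter;
 *	void *obj;
 *
 *	if (rhashtable_walk_init(ht, &iter))
 *		return;
 *
 *	rhashtable_walk_start(&iter);
 *	while ((obj = rhashtable_walk_next(&iter)) && !IS_ERR(obj))
 *		pr_debug("visited %p\n", obj);
 *	rhashtable_walk_stop(&iter);
 *	rhashtable_walk_exit(&iter);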
849*f2dba9c6SHerbert Xu */ 850*f2dba9c6SHerbert Xu int rhashtable_walk_init(struct rhashtable *ht, struct rhashtable_iter *iter) 851*f2dba9c6SHerbert Xu { 852*f2dba9c6SHerbert Xu iter->ht = ht; 853*f2dba9c6SHerbert Xu iter->p = NULL; 854*f2dba9c6SHerbert Xu iter->slot = 0; 855*f2dba9c6SHerbert Xu iter->skip = 0; 856*f2dba9c6SHerbert Xu 857*f2dba9c6SHerbert Xu iter->walker = kmalloc(sizeof(*iter->walker), GFP_KERNEL); 858*f2dba9c6SHerbert Xu if (!iter->walker) 859*f2dba9c6SHerbert Xu return -ENOMEM; 860*f2dba9c6SHerbert Xu 861*f2dba9c6SHerbert Xu mutex_lock(&ht->mutex); 862*f2dba9c6SHerbert Xu list_add(&iter->walker->list, &ht->walkers); 863*f2dba9c6SHerbert Xu mutex_unlock(&ht->mutex); 864*f2dba9c6SHerbert Xu 865*f2dba9c6SHerbert Xu return 0; 866*f2dba9c6SHerbert Xu } 867*f2dba9c6SHerbert Xu EXPORT_SYMBOL_GPL(rhashtable_walk_init); 868*f2dba9c6SHerbert Xu 869*f2dba9c6SHerbert Xu /** 870*f2dba9c6SHerbert Xu * rhashtable_walk_exit - Free an iterator 871*f2dba9c6SHerbert Xu * @iter: Hash table Iterator 872*f2dba9c6SHerbert Xu * 873*f2dba9c6SHerbert Xu * This function frees resources allocated by rhashtable_walk_init. 874*f2dba9c6SHerbert Xu */ 875*f2dba9c6SHerbert Xu void rhashtable_walk_exit(struct rhashtable_iter *iter) 876*f2dba9c6SHerbert Xu { 877*f2dba9c6SHerbert Xu mutex_lock(&iter->ht->mutex); 878*f2dba9c6SHerbert Xu list_del(&iter->walker->list); 879*f2dba9c6SHerbert Xu mutex_unlock(&iter->ht->mutex); 880*f2dba9c6SHerbert Xu kfree(iter->walker); 881*f2dba9c6SHerbert Xu } 882*f2dba9c6SHerbert Xu EXPORT_SYMBOL_GPL(rhashtable_walk_exit); 883*f2dba9c6SHerbert Xu 884*f2dba9c6SHerbert Xu /** 885*f2dba9c6SHerbert Xu * rhashtable_walk_start - Start a hash table walk 886*f2dba9c6SHerbert Xu * @iter: Hash table iterator 887*f2dba9c6SHerbert Xu * 888*f2dba9c6SHerbert Xu * Start a hash table walk. Note that we take the RCU lock in all 889*f2dba9c6SHerbert Xu * cases including when we return an error. So you must always call 890*f2dba9c6SHerbert Xu * rhashtable_walk_stop to clean up. 891*f2dba9c6SHerbert Xu * 892*f2dba9c6SHerbert Xu * Returns zero if successful. 893*f2dba9c6SHerbert Xu * 894*f2dba9c6SHerbert Xu * Returns -EAGAIN if resize event occured. Note that the iterator 895*f2dba9c6SHerbert Xu * will rewind back to the beginning and you may use it immediately 896*f2dba9c6SHerbert Xu * by calling rhashtable_walk_next. 897*f2dba9c6SHerbert Xu */ 898*f2dba9c6SHerbert Xu int rhashtable_walk_start(struct rhashtable_iter *iter) 899*f2dba9c6SHerbert Xu { 900*f2dba9c6SHerbert Xu rcu_read_lock(); 901*f2dba9c6SHerbert Xu 902*f2dba9c6SHerbert Xu if (iter->walker->resize) { 903*f2dba9c6SHerbert Xu iter->slot = 0; 904*f2dba9c6SHerbert Xu iter->skip = 0; 905*f2dba9c6SHerbert Xu iter->walker->resize = false; 906*f2dba9c6SHerbert Xu return -EAGAIN; 907*f2dba9c6SHerbert Xu } 908*f2dba9c6SHerbert Xu 909*f2dba9c6SHerbert Xu return 0; 910*f2dba9c6SHerbert Xu } 911*f2dba9c6SHerbert Xu EXPORT_SYMBOL_GPL(rhashtable_walk_start); 912*f2dba9c6SHerbert Xu 913*f2dba9c6SHerbert Xu /** 914*f2dba9c6SHerbert Xu * rhashtable_walk_next - Return the next object and advance the iterator 915*f2dba9c6SHerbert Xu * @iter: Hash table iterator 916*f2dba9c6SHerbert Xu * 917*f2dba9c6SHerbert Xu * Note that you must call rhashtable_walk_stop when you are finished 918*f2dba9c6SHerbert Xu * with the walk. 919*f2dba9c6SHerbert Xu * 920*f2dba9c6SHerbert Xu * Returns the next object or NULL when the end of the table is reached. 921*f2dba9c6SHerbert Xu * 922*f2dba9c6SHerbert Xu * Returns -EAGAIN if resize event occured. 
Note that the iterator 923*f2dba9c6SHerbert Xu * will rewind back to the beginning and you may continue to use it. 924*f2dba9c6SHerbert Xu */ 925*f2dba9c6SHerbert Xu void *rhashtable_walk_next(struct rhashtable_iter *iter) 926*f2dba9c6SHerbert Xu { 927*f2dba9c6SHerbert Xu const struct bucket_table *tbl; 928*f2dba9c6SHerbert Xu struct rhashtable *ht = iter->ht; 929*f2dba9c6SHerbert Xu struct rhash_head *p = iter->p; 930*f2dba9c6SHerbert Xu void *obj = NULL; 931*f2dba9c6SHerbert Xu 932*f2dba9c6SHerbert Xu tbl = rht_dereference_rcu(ht->tbl, ht); 933*f2dba9c6SHerbert Xu 934*f2dba9c6SHerbert Xu if (p) { 935*f2dba9c6SHerbert Xu p = rht_dereference_bucket_rcu(p->next, tbl, iter->slot); 936*f2dba9c6SHerbert Xu goto next; 937*f2dba9c6SHerbert Xu } 938*f2dba9c6SHerbert Xu 939*f2dba9c6SHerbert Xu for (; iter->slot < tbl->size; iter->slot++) { 940*f2dba9c6SHerbert Xu int skip = iter->skip; 941*f2dba9c6SHerbert Xu 942*f2dba9c6SHerbert Xu rht_for_each_rcu(p, tbl, iter->slot) { 943*f2dba9c6SHerbert Xu if (!skip) 944*f2dba9c6SHerbert Xu break; 945*f2dba9c6SHerbert Xu skip--; 946*f2dba9c6SHerbert Xu } 947*f2dba9c6SHerbert Xu 948*f2dba9c6SHerbert Xu next: 949*f2dba9c6SHerbert Xu if (!rht_is_a_nulls(p)) { 950*f2dba9c6SHerbert Xu iter->skip++; 951*f2dba9c6SHerbert Xu iter->p = p; 952*f2dba9c6SHerbert Xu obj = rht_obj(ht, p); 953*f2dba9c6SHerbert Xu goto out; 954*f2dba9c6SHerbert Xu } 955*f2dba9c6SHerbert Xu 956*f2dba9c6SHerbert Xu iter->skip = 0; 957*f2dba9c6SHerbert Xu } 958*f2dba9c6SHerbert Xu 959*f2dba9c6SHerbert Xu iter->p = NULL; 960*f2dba9c6SHerbert Xu 961*f2dba9c6SHerbert Xu out: 962*f2dba9c6SHerbert Xu if (iter->walker->resize) { 963*f2dba9c6SHerbert Xu iter->p = NULL; 964*f2dba9c6SHerbert Xu iter->slot = 0; 965*f2dba9c6SHerbert Xu iter->skip = 0; 966*f2dba9c6SHerbert Xu iter->walker->resize = false; 967*f2dba9c6SHerbert Xu return ERR_PTR(-EAGAIN); 968*f2dba9c6SHerbert Xu } 969*f2dba9c6SHerbert Xu 970*f2dba9c6SHerbert Xu return obj; 971*f2dba9c6SHerbert Xu } 972*f2dba9c6SHerbert Xu EXPORT_SYMBOL_GPL(rhashtable_walk_next); 973*f2dba9c6SHerbert Xu 974*f2dba9c6SHerbert Xu /** 975*f2dba9c6SHerbert Xu * rhashtable_walk_stop - Finish a hash table walk 976*f2dba9c6SHerbert Xu * @iter: Hash table iterator 977*f2dba9c6SHerbert Xu * 978*f2dba9c6SHerbert Xu * Finish a hash table walk. 979*f2dba9c6SHerbert Xu */ 980*f2dba9c6SHerbert Xu void rhashtable_walk_stop(struct rhashtable_iter *iter) 981*f2dba9c6SHerbert Xu { 982*f2dba9c6SHerbert Xu rcu_read_unlock(); 983*f2dba9c6SHerbert Xu iter->p = NULL; 984*f2dba9c6SHerbert Xu } 985*f2dba9c6SHerbert Xu EXPORT_SYMBOL_GPL(rhashtable_walk_stop); 986*f2dba9c6SHerbert Xu 98794000176SYing Xue static size_t rounded_hashtable_size(struct rhashtable_params *params) 9887e1e7763SThomas Graf { 98994000176SYing Xue return max(roundup_pow_of_two(params->nelem_hint * 4 / 3), 99094000176SYing Xue 1UL << params->min_shift); 9917e1e7763SThomas Graf } 9927e1e7763SThomas Graf 9937e1e7763SThomas Graf /** 9947e1e7763SThomas Graf * rhashtable_init - initialize a new hash table 9957e1e7763SThomas Graf * @ht: hash table to be initialized 9967e1e7763SThomas Graf * @params: configuration parameters 9977e1e7763SThomas Graf * 9987e1e7763SThomas Graf * Initializes a new hash table based on the provided configuration 9997e1e7763SThomas Graf * parameters. 
A table can be configured either with a variable or 10007e1e7763SThomas Graf * fixed length key: 10017e1e7763SThomas Graf * 10027e1e7763SThomas Graf * Configuration Example 1: Fixed length keys 10037e1e7763SThomas Graf * struct test_obj { 10047e1e7763SThomas Graf * int key; 10057e1e7763SThomas Graf * void * my_member; 10067e1e7763SThomas Graf * struct rhash_head node; 10077e1e7763SThomas Graf * }; 10087e1e7763SThomas Graf * 10097e1e7763SThomas Graf * struct rhashtable_params params = { 10107e1e7763SThomas Graf * .head_offset = offsetof(struct test_obj, node), 10117e1e7763SThomas Graf * .key_offset = offsetof(struct test_obj, key), 10127e1e7763SThomas Graf * .key_len = sizeof(int), 101387545899SDaniel Borkmann * .hashfn = jhash, 1014f89bd6f8SThomas Graf * .nulls_base = (1U << RHT_BASE_SHIFT), 10157e1e7763SThomas Graf * }; 10167e1e7763SThomas Graf * 10177e1e7763SThomas Graf * Configuration Example 2: Variable length keys 10187e1e7763SThomas Graf * struct test_obj { 10197e1e7763SThomas Graf * [...] 10207e1e7763SThomas Graf * struct rhash_head node; 10217e1e7763SThomas Graf * }; 10227e1e7763SThomas Graf * 10237e1e7763SThomas Graf * u32 my_hash_fn(const void *data, u32 seed) 10247e1e7763SThomas Graf * { 10257e1e7763SThomas Graf * struct test_obj *obj = data; 10267e1e7763SThomas Graf * 10277e1e7763SThomas Graf * return [... hash ...]; 10287e1e7763SThomas Graf * } 10297e1e7763SThomas Graf * 10307e1e7763SThomas Graf * struct rhashtable_params params = { 10317e1e7763SThomas Graf * .head_offset = offsetof(struct test_obj, node), 103287545899SDaniel Borkmann * .hashfn = jhash, 10337e1e7763SThomas Graf * .obj_hashfn = my_hash_fn, 10347e1e7763SThomas Graf * }; 10357e1e7763SThomas Graf */ 10367e1e7763SThomas Graf int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params) 10377e1e7763SThomas Graf { 10387e1e7763SThomas Graf struct bucket_table *tbl; 10397e1e7763SThomas Graf size_t size; 10407e1e7763SThomas Graf 10417e1e7763SThomas Graf size = HASH_DEFAULT_SIZE; 10427e1e7763SThomas Graf 10437e1e7763SThomas Graf if ((params->key_len && !params->hashfn) || 10447e1e7763SThomas Graf (!params->key_len && !params->obj_hashfn)) 10457e1e7763SThomas Graf return -EINVAL; 10467e1e7763SThomas Graf 1047f89bd6f8SThomas Graf if (params->nulls_base && params->nulls_base < (1U << RHT_BASE_SHIFT)) 1048f89bd6f8SThomas Graf return -EINVAL; 1049f89bd6f8SThomas Graf 105094000176SYing Xue params->min_shift = max_t(size_t, params->min_shift, 105194000176SYing Xue ilog2(HASH_MIN_SIZE)); 105294000176SYing Xue 10537e1e7763SThomas Graf if (params->nelem_hint) 105494000176SYing Xue size = rounded_hashtable_size(params); 10557e1e7763SThomas Graf 105697defe1eSThomas Graf memset(ht, 0, sizeof(*ht)); 105797defe1eSThomas Graf mutex_init(&ht->mutex); 105897defe1eSThomas Graf memcpy(&ht->p, params, sizeof(*params)); 1059*f2dba9c6SHerbert Xu INIT_LIST_HEAD(&ht->walkers); 106097defe1eSThomas Graf 106197defe1eSThomas Graf if (params->locks_mul) 106297defe1eSThomas Graf ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); 106397defe1eSThomas Graf else 106497defe1eSThomas Graf ht->p.locks_mul = BUCKET_LOCKS_PER_CPU; 106597defe1eSThomas Graf 106697defe1eSThomas Graf tbl = bucket_table_alloc(ht, size); 10677e1e7763SThomas Graf if (tbl == NULL) 10687e1e7763SThomas Graf return -ENOMEM; 10697e1e7763SThomas Graf 1070545a148eSYing Xue atomic_set(&ht->nelems, 0); 1071c0c09bfdSYing Xue atomic_set(&ht->shift, ilog2(tbl->size)); 10727e1e7763SThomas Graf RCU_INIT_POINTER(ht->tbl, tbl); 107397defe1eSThomas Graf 
RCU_INIT_POINTER(ht->future_tbl, tbl); 10747e1e7763SThomas Graf 10757e1e7763SThomas Graf if (!ht->p.hash_rnd) 10767e1e7763SThomas Graf get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd)); 10777e1e7763SThomas Graf 107897defe1eSThomas Graf if (ht->p.grow_decision || ht->p.shrink_decision) 107957699a40SYing Xue INIT_WORK(&ht->run_work, rht_deferred_worker); 108097defe1eSThomas Graf 10817e1e7763SThomas Graf return 0; 10827e1e7763SThomas Graf } 10837e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_init); 10847e1e7763SThomas Graf 10857e1e7763SThomas Graf /** 10867e1e7763SThomas Graf * rhashtable_destroy - destroy hash table 10877e1e7763SThomas Graf * @ht: the hash table to destroy 10887e1e7763SThomas Graf * 1089ae82ddcfSPablo Neira Ayuso * Frees the bucket array. This function is not rcu safe, therefore the caller 1090ae82ddcfSPablo Neira Ayuso * has to make sure that no resizing may happen by unpublishing the hashtable 1091ae82ddcfSPablo Neira Ayuso * and waiting for the quiescent cycle before releasing the bucket array. 10927e1e7763SThomas Graf */ 109397defe1eSThomas Graf void rhashtable_destroy(struct rhashtable *ht) 10947e1e7763SThomas Graf { 109597defe1eSThomas Graf ht->being_destroyed = true; 109697defe1eSThomas Graf 109757699a40SYing Xue if (ht->p.grow_decision || ht->p.shrink_decision) 109857699a40SYing Xue cancel_work_sync(&ht->run_work); 109957699a40SYing Xue 110097defe1eSThomas Graf mutex_lock(&ht->mutex); 110197defe1eSThomas Graf bucket_table_free(rht_dereference(ht->tbl, ht)); 110297defe1eSThomas Graf mutex_unlock(&ht->mutex); 11037e1e7763SThomas Graf } 11047e1e7763SThomas Graf EXPORT_SYMBOL_GPL(rhashtable_destroy); 1105
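
/* Usage note (illustrative, not part of the code above): the deferred
 * resizing worker is only armed when grow/shrink decisions are supplied
 * in the parameters passed to rhashtable_init(), for example by reusing
 * the helpers exported above together with the fixed length key example
 * from the rhashtable_init() documentation:
 *
 *	struct rhashtable_params params = {
 *		.head_offset	 = offsetof(struct test_obj, node),
 *		.key_offset	 = offsetof(struct test_obj, key),
 *		.key_len	 = sizeof(int),
 *		.hashfn		 = jhash,
 *		.nulls_base	 = (1U << RHT_BASE_SHIFT),
 *		.grow_decision	 = rht_grow_above_75,
 *		.shrink_decision = rht_shrink_below_30,
 *	};
 */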