xref: /linux/net/openvswitch/flow_table.c (revision 0ea5c948cb64bab5bc7a5516774eb8536f05aa0d)
1  // SPDX-License-Identifier: GPL-2.0-only
2  /*
3   * Copyright (c) 2007-2014 Nicira, Inc.
4   */
5  
6  #include "flow.h"
7  #include "datapath.h"
8  #include "flow_netlink.h"
9  #include <linux/uaccess.h>
10  #include <linux/netdevice.h>
11  #include <linux/etherdevice.h>
12  #include <linux/if_ether.h>
13  #include <linux/if_vlan.h>
14  #include <net/llc_pdu.h>
15  #include <linux/kernel.h>
16  #include <linux/jhash.h>
17  #include <linux/jiffies.h>
18  #include <linux/llc.h>
19  #include <linux/module.h>
20  #include <linux/in.h>
21  #include <linux/rcupdate.h>
22  #include <linux/cpumask.h>
23  #include <linux/if_arp.h>
24  #include <linux/ip.h>
25  #include <linux/ipv6.h>
26  #include <linux/sctp.h>
27  #include <linux/tcp.h>
28  #include <linux/udp.h>
29  #include <linux/icmp.h>
30  #include <linux/icmpv6.h>
31  #include <linux/rculist.h>
32  #include <linux/sort.h>
33  #include <net/ip.h>
34  #include <net/ipv6.h>
35  #include <net/ndisc.h>
36  
37  #define TBL_MIN_BUCKETS		1024
38  #define MASK_ARRAY_SIZE_MIN	16
39  #define REHASH_INTERVAL		(10 * 60 * HZ)
40  
41  #define MC_DEFAULT_HASH_ENTRIES	256
42  #define MC_HASH_SHIFT		8
43  #define MC_HASH_SEGS		((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)
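/* With a 32-bit skb_hash and MC_HASH_SHIFT of 8, MC_HASH_SEGS works out to
 * 32 / 8 = 4: a lookup can probe up to four segments of the per-CPU mask
 * cache, consuming 8 bits of the hash per probe.
 */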
44  
45  static struct kmem_cache *flow_cache;
46  struct kmem_cache *flow_stats_cache __read_mostly;
47  
48  static u16 range_n_bytes(const struct sw_flow_key_range *range)
49  {
50  	return range->end - range->start;
51  }
52  
53  void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
54  		       bool full, const struct sw_flow_mask *mask)
55  {
56  	int start = full ? 0 : mask->range.start;
57  	int len = full ? sizeof *dst : range_n_bytes(&mask->range);
58  	const long *m = (const long *)((const u8 *)&mask->key + start);
59  	const long *s = (const long *)((const u8 *)src + start);
60  	long *d = (long *)((u8 *)dst + start);
61  	int i;
62  
63  	/* If 'full' is true then all of 'dst' is fully initialized. Otherwise,
64  	 * if 'full' is false the memory outside of the 'mask->range' is left
65  	 * uninitialized. This can be used as an optimization when further
66  	 * operations on 'dst' only use contents within 'mask->range'.
67  	 */
68  	for (i = 0; i < len; i += sizeof(long))
69  		*d++ = *s++ & *m++;
70  }
71  
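/* Allocate a flow and its default (CPU 0) statistics node. The per-CPU
 * stats pointer array and the trailing cpu_used_mask live in the same
 * allocation as struct sw_flow (see the flow_cache size in ovs_flow_init()).
 */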
72  struct sw_flow *ovs_flow_alloc(void)
73  {
74  	struct sw_flow *flow;
75  	struct sw_flow_stats *stats;
76  
77  	flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
78  	if (!flow)
79  		return ERR_PTR(-ENOMEM);
80  
81  	flow->stats_last_writer = -1;
82  	flow->cpu_used_mask = (struct cpumask *)&flow->stats[nr_cpu_ids];
83  
84  	/* Initialize the default stat node. */
85  	stats = kmem_cache_alloc_node(flow_stats_cache,
86  				      GFP_KERNEL | __GFP_ZERO,
87  				      node_online(0) ? 0 : NUMA_NO_NODE);
88  	if (!stats)
89  		goto err;
90  
91  	spin_lock_init(&stats->lock);
92  
93  	RCU_INIT_POINTER(flow->stats[0], stats);
94  
95  	cpumask_set_cpu(0, flow->cpu_used_mask);
96  
97  	return flow;
98  err:
99  	kmem_cache_free(flow_cache, flow);
100  	return ERR_PTR(-ENOMEM);
101  }
102  
103  int ovs_flow_tbl_count(const struct flow_table *table)
104  {
105  	return table->count;
106  }
107  
108  static void flow_free(struct sw_flow *flow)
109  {
110  	int cpu;
111  
112  	if (ovs_identifier_is_key(&flow->id))
113  		kfree(flow->id.unmasked_key);
114  	if (flow->sf_acts)
115  		ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
116  					  flow->sf_acts);
117  	/* We open code this to make sure cpu 0 is always considered */
118  	for (cpu = 0; cpu < nr_cpu_ids;
119  	     cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
120  		if (flow->stats[cpu])
121  			kmem_cache_free(flow_stats_cache,
122  					(struct sw_flow_stats __force *)flow->stats[cpu]);
123  	}
124  
125  	kmem_cache_free(flow_cache, flow);
126  }
127  
128  static void rcu_free_flow_callback(struct rcu_head *rcu)
129  {
130  	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);
131  
132  	flow_free(flow);
133  }
134  
135  void ovs_flow_free(struct sw_flow *flow, bool deferred)
136  {
137  	if (!flow)
138  		return;
139  
140  	if (deferred)
141  		call_rcu(&flow->rcu, rcu_free_flow_callback);
142  	else
143  		flow_free(flow);
144  }
145  
146  static void __table_instance_destroy(struct table_instance *ti)
147  {
148  	kvfree(ti->buckets);
149  	kfree(ti);
150  }
151  
152  static struct table_instance *table_instance_alloc(int new_size)
153  {
154  	struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
155  	int i;
156  
157  	if (!ti)
158  		return NULL;
159  
160  	ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
161  				     GFP_KERNEL);
162  	if (!ti->buckets) {
163  		kfree(ti);
164  		return NULL;
165  	}
166  
167  	for (i = 0; i < new_size; i++)
168  		INIT_HLIST_HEAD(&ti->buckets[i]);
169  
170  	ti->n_buckets = new_size;
171  	ti->node_ver = 0;
172  	get_random_bytes(&ti->hash_seed, sizeof(u32));
173  
174  	return ti;
175  }
176  
177  static void __mask_array_destroy(struct mask_array *ma)
178  {
179  	free_percpu(ma->masks_usage_stats);
180  	kfree(ma);
181  }
182  
183  static void mask_array_rcu_cb(struct rcu_head *rcu)
184  {
185  	struct mask_array *ma = container_of(rcu, struct mask_array, rcu);
186  
187  	__mask_array_destroy(ma);
188  }
189  
190  static void tbl_mask_array_reset_counters(struct mask_array *ma)
191  {
192  	int i, cpu;
193  
194  	/* As the per-CPU counters are not atomic, we cannot reset them from
195  	 * another CPU. To still provide an approximately zero-based counter,
196  	 * we store the value at reset time and subtract it later when
197  	 * processing.
198  	 */
199  	for (i = 0; i < ma->max; i++) {
200  		ma->masks_usage_zero_cntr[i] = 0;
201  
202  		for_each_possible_cpu(cpu) {
203  			struct mask_array_stats *stats;
204  			unsigned int start;
205  			u64 counter;
206  
207  			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
208  			do {
209  				start = u64_stats_fetch_begin(&stats->syncp);
210  				counter = stats->usage_cntrs[i];
211  			} while (u64_stats_fetch_retry(&stats->syncp, start));
212  
213  			ma->masks_usage_zero_cntr[i] += counter;
214  		}
215  	}
216  }
217  
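/* The zero-baseline counters (masks_usage_zero_cntr) are carved out of the
 * same allocation, directly after the flexible masks[] array, while the
 * per-CPU usage stats are allocated separately with __alloc_percpu().
 */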
218  static struct mask_array *tbl_mask_array_alloc(int size)
219  {
220  	struct mask_array *new;
221  
222  	size = max(MASK_ARRAY_SIZE_MIN, size);
223  	new = kzalloc(struct_size(new, masks, size) +
224  		      sizeof(u64) * size, GFP_KERNEL);
225  	if (!new)
226  		return NULL;
227  
228  	new->masks_usage_zero_cntr = (u64 *)((u8 *)new +
229  					     struct_size(new, masks, size));
230  
231  	new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
232  						sizeof(u64) * size,
233  						__alignof__(u64));
234  	if (!new->masks_usage_stats) {
235  		kfree(new);
236  		return NULL;
237  	}
238  
239  	new->count = 0;
240  	new->max = size;
241  
242  	return new;
243  }
244  
245  static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
246  {
247  	struct mask_array *old;
248  	struct mask_array *new;
249  
250  	new = tbl_mask_array_alloc(size);
251  	if (!new)
252  		return -ENOMEM;
253  
254  	old = ovsl_dereference(tbl->mask_array);
255  	if (old) {
256  		int i;
257  
258  		for (i = 0; i < old->max; i++) {
259  			if (ovsl_dereference(old->masks[i]))
260  				new->masks[new->count++] = old->masks[i];
261  		}
262  		call_rcu(&old->rcu, mask_array_rcu_cb);
263  	}
264  
265  	rcu_assign_pointer(tbl->mask_array, new);
266  
267  	return 0;
268  }
269  
270  static int tbl_mask_array_add_mask(struct flow_table *tbl,
271  				   struct sw_flow_mask *new)
272  {
273  	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
274  	int err, ma_count = READ_ONCE(ma->count);
275  
276  	if (ma_count >= ma->max) {
277  		err = tbl_mask_array_realloc(tbl, ma->max +
278  						  MASK_ARRAY_SIZE_MIN);
279  		if (err)
280  			return err;
281  
282  		ma = ovsl_dereference(tbl->mask_array);
283  	} else {
284  		/* On every add or delete we need to reset the counters so
285  		 * every new mask gets a fair chance of being prioritized.
286  		 */
287  		tbl_mask_array_reset_counters(ma);
288  	}
289  
290  	BUG_ON(ovsl_dereference(ma->masks[ma_count]));
291  
292  	rcu_assign_pointer(ma->masks[ma_count], new);
293  	WRITE_ONCE(ma->count, ma_count + 1);
294  
295  	return 0;
296  }
297  
298  static void tbl_mask_array_del_mask(struct flow_table *tbl,
299  				    struct sw_flow_mask *mask)
300  {
301  	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
302  	int i, ma_count = READ_ONCE(ma->count);
303  
304  	/* Remove the deleted mask pointers from the array */
305  	for (i = 0; i < ma_count; i++) {
306  		if (mask == ovsl_dereference(ma->masks[i]))
307  			goto found;
308  	}
309  
310  	BUG();
311  	return;
312  
313  found:
314  	WRITE_ONCE(ma->count, ma_count - 1);
315  
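	/* Keep the array dense: move the last mask into the freed slot. */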
316  	rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
317  	RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);
318  
319  	kfree_rcu(mask, rcu);
320  
321  	/* Shrink the mask array if necessary. */
322  	if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
323  	    ma_count <= (ma->max / 3))
324  		tbl_mask_array_realloc(tbl, ma->max / 2);
325  	else
326  		tbl_mask_array_reset_counters(ma);
327  
328  }
329  
330  /* Remove 'mask' from the mask list, if it is not needed any more. */
331  static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
332  {
333  	if (mask) {
334  		/* ovs-lock is required to protect mask-refcount and
335  		 * mask list.
336  		 */
337  		ASSERT_OVSL();
338  		BUG_ON(!mask->ref_count);
339  		mask->ref_count--;
340  
341  		if (!mask->ref_count)
342  			tbl_mask_array_del_mask(tbl, mask);
343  	}
344  }
345  
346  static void __mask_cache_destroy(struct mask_cache *mc)
347  {
348  	free_percpu(mc->mask_cache);
349  	kfree(mc);
350  }
351  
352  static void mask_cache_rcu_cb(struct rcu_head *rcu)
353  {
354  	struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu);
355  
356  	__mask_cache_destroy(mc);
357  }
358  
359  static struct mask_cache *tbl_mask_cache_alloc(u32 size)
360  {
361  	struct mask_cache_entry __percpu *cache = NULL;
362  	struct mask_cache *new;
363  
364  	/* Only allow the size to be 0 or a power of 2, and it must not
365  	 * exceed the per-CPU allocation size.
366  	 */
367  	if ((!is_power_of_2(size) && size != 0) ||
368  	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
369  		return NULL;
370  
371  	new = kzalloc(sizeof(*new), GFP_KERNEL);
372  	if (!new)
373  		return NULL;
374  
375  	new->cache_size = size;
376  	if (new->cache_size > 0) {
377  		cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry),
378  						  new->cache_size),
379  				       __alignof__(struct mask_cache_entry));
380  		if (!cache) {
381  			kfree(new);
382  			return NULL;
383  		}
384  	}
385  
386  	new->mask_cache = cache;
387  	return new;
388  }
389  int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size)
390  {
391  	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
392  	struct mask_cache *new;
393  
394  	if (size == mc->cache_size)
395  		return 0;
396  
397  	if ((!is_power_of_2(size) && size != 0) ||
398  	    (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE)
399  		return -EINVAL;
400  
401  	new = tbl_mask_cache_alloc(size);
402  	if (!new)
403  		return -ENOMEM;
404  
405  	rcu_assign_pointer(table->mask_cache, new);
406  	call_rcu(&mc->rcu, mask_cache_rcu_cb);
407  
408  	return 0;
409  }
410  
411  int ovs_flow_tbl_init(struct flow_table *table)
412  {
413  	struct table_instance *ti, *ufid_ti;
414  	struct mask_cache *mc;
415  	struct mask_array *ma;
416  
417  	mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES);
418  	if (!mc)
419  		return -ENOMEM;
420  
421  	ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
422  	if (!ma)
423  		goto free_mask_cache;
424  
425  	ti = table_instance_alloc(TBL_MIN_BUCKETS);
426  	if (!ti)
427  		goto free_mask_array;
428  
429  	ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
430  	if (!ufid_ti)
431  		goto free_ti;
432  
433  	rcu_assign_pointer(table->ti, ti);
434  	rcu_assign_pointer(table->ufid_ti, ufid_ti);
435  	rcu_assign_pointer(table->mask_array, ma);
436  	rcu_assign_pointer(table->mask_cache, mc);
437  	table->last_rehash = jiffies;
438  	table->count = 0;
439  	table->ufid_count = 0;
440  	return 0;
441  
442  free_ti:
443  	__table_instance_destroy(ti);
444  free_mask_array:
445  	__mask_array_destroy(ma);
446  free_mask_cache:
447  	__mask_cache_destroy(mc);
448  	return -ENOMEM;
449  }
450  
451  static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
452  {
453  	struct table_instance *ti;
454  
455  	ti = container_of(rcu, struct table_instance, rcu);
456  	__table_instance_destroy(ti);
457  }
458  
459  static void table_instance_flow_free(struct flow_table *table,
460  				     struct table_instance *ti,
461  				     struct table_instance *ufid_ti,
462  				     struct sw_flow *flow)
463  {
464  	hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
465  	table->count--;
466  
467  	if (ovs_identifier_is_ufid(&flow->id)) {
468  		hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
469  		table->ufid_count--;
470  	}
471  
472  	flow_mask_remove(table, flow->mask);
473  }
474  
475  /* Must be called with OVS mutex held. */
476  void table_instance_flow_flush(struct flow_table *table,
477  			       struct table_instance *ti,
478  			       struct table_instance *ufid_ti)
479  {
480  	int i;
481  
482  	for (i = 0; i < ti->n_buckets; i++) {
483  		struct hlist_head *head = &ti->buckets[i];
484  		struct hlist_node *n;
485  		struct sw_flow *flow;
486  
487  		hlist_for_each_entry_safe(flow, n, head,
488  					  flow_table.node[ti->node_ver]) {
489  
490  			table_instance_flow_free(table, ti, ufid_ti,
491  						 flow);
492  			ovs_flow_free(flow, true);
493  		}
494  	}
495  
496  	if (WARN_ON(table->count != 0 ||
497  		    table->ufid_count != 0)) {
498  		table->count = 0;
499  		table->ufid_count = 0;
500  	}
501  }
502  
503  static void table_instance_destroy(struct table_instance *ti,
504  				   struct table_instance *ufid_ti)
505  {
506  	call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
507  	call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
508  }
509  
510  /* No need for locking; this function is called from an RCU callback or
511   * the error path.
512   */
513  void ovs_flow_tbl_destroy(struct flow_table *table)
514  {
515  	struct table_instance *ti = rcu_dereference_raw(table->ti);
516  	struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);
517  	struct mask_cache *mc = rcu_dereference_raw(table->mask_cache);
518  	struct mask_array *ma = rcu_dereference_raw(table->mask_array);
519  
520  	call_rcu(&mc->rcu, mask_cache_rcu_cb);
521  	call_rcu(&ma->rcu, mask_array_rcu_cb);
522  	table_instance_destroy(ti, ufid_ti);
523  }
524  
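/* Used by flow dumps: *bucket and *last record the current position (bucket
 * index and 1-based entry offset within that bucket) so that a dump can be
 * resumed from where the previous call stopped.
 */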
525  struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
526  				       u32 *bucket, u32 *last)
527  {
528  	struct sw_flow *flow;
529  	struct hlist_head *head;
530  	int ver;
531  	int i;
532  
533  	ver = ti->node_ver;
534  	while (*bucket < ti->n_buckets) {
535  		i = 0;
536  		head = &ti->buckets[*bucket];
537  		hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
538  			if (i < *last) {
539  				i++;
540  				continue;
541  			}
542  			*last = i + 1;
543  			return flow;
544  		}
545  		(*bucket)++;
546  		*last = 0;
547  	}
548  
549  	return NULL;
550  }
551  
552  static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
553  {
554  	hash = jhash_1word(hash, ti->hash_seed);
555  	return &ti->buckets[hash & (ti->n_buckets - 1)];
556  }
557  
558  static void table_instance_insert(struct table_instance *ti,
559  				  struct sw_flow *flow)
560  {
561  	struct hlist_head *head;
562  
563  	head = find_bucket(ti, flow->flow_table.hash);
564  	hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head);
565  }
566  
567  static void ufid_table_instance_insert(struct table_instance *ti,
568  				       struct sw_flow *flow)
569  {
570  	struct hlist_head *head;
571  
572  	head = find_bucket(ti, flow->ufid_table.hash);
573  	hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head);
574  }
575  
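/* Each flow carries two hlist nodes per table so it can be linked into both
 * the old and the new table instance while rehashing; the new instance uses
 * the complement of the old node_ver to pick its node.
 */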
576  static void flow_table_copy_flows(struct table_instance *old,
577  				  struct table_instance *new, bool ufid)
578  {
579  	int old_ver;
580  	int i;
581  
582  	old_ver = old->node_ver;
583  	new->node_ver = !old_ver;
584  
585  	/* Insert in new table. */
586  	for (i = 0; i < old->n_buckets; i++) {
587  		struct sw_flow *flow;
588  		struct hlist_head *head = &old->buckets[i];
589  
590  		if (ufid)
591  			hlist_for_each_entry_rcu(flow, head,
592  						 ufid_table.node[old_ver],
593  						 lockdep_ovsl_is_held())
594  				ufid_table_instance_insert(new, flow);
595  		else
596  			hlist_for_each_entry_rcu(flow, head,
597  						 flow_table.node[old_ver],
598  						 lockdep_ovsl_is_held())
599  				table_instance_insert(new, flow);
600  	}
601  }
602  
603  static struct table_instance *table_instance_rehash(struct table_instance *ti,
604  						    int n_buckets, bool ufid)
605  {
606  	struct table_instance *new_ti;
607  
608  	new_ti = table_instance_alloc(n_buckets);
609  	if (!new_ti)
610  		return NULL;
611  
612  	flow_table_copy_flows(ti, new_ti, ufid);
613  
614  	return new_ti;
615  }
616  
617  int ovs_flow_tbl_flush(struct flow_table *flow_table)
618  {
619  	struct table_instance *old_ti, *new_ti;
620  	struct table_instance *old_ufid_ti, *new_ufid_ti;
621  
622  	new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
623  	if (!new_ti)
624  		return -ENOMEM;
625  	new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
626  	if (!new_ufid_ti)
627  		goto err_free_ti;
628  
629  	old_ti = ovsl_dereference(flow_table->ti);
630  	old_ufid_ti = ovsl_dereference(flow_table->ufid_ti);
631  
632  	rcu_assign_pointer(flow_table->ti, new_ti);
633  	rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
634  	flow_table->last_rehash = jiffies;
635  
636  	table_instance_flow_flush(flow_table, old_ti, old_ufid_ti);
637  	table_instance_destroy(old_ti, old_ufid_ti);
638  	return 0;
639  
640  err_free_ti:
641  	__table_instance_destroy(new_ti);
642  	return -ENOMEM;
643  }
644  
645  static u32 flow_hash(const struct sw_flow_key *key,
646  		     const struct sw_flow_key_range *range)
647  {
648  	const u32 *hash_key = (const u32 *)((const u8 *)key + range->start);
649  
650  	/* Make sure the number of hash bytes is a multiple of u32. */
651  	int hash_u32s = range_n_bytes(range) >> 2;
652  
653  	return jhash2(hash_key, hash_u32s, 0);
654  }
655  
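/* Comparison of an unmasked key starts at offset 0 when tunnel metadata is
 * present; otherwise the tunnel key at the front of sw_flow_key is skipped
 * and the start offset is rounded down to a long boundary.
 */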
656  static int flow_key_start(const struct sw_flow_key *key)
657  {
658  	if (key->tun_proto)
659  		return 0;
660  	else
661  		return rounddown(offsetof(struct sw_flow_key, phy),
662  				 sizeof(long));
663  }
664  
665  static bool cmp_key(const struct sw_flow_key *key1,
666  		    const struct sw_flow_key *key2,
667  		    int key_start, int key_end)
668  {
669  	const long *cp1 = (const long *)((const u8 *)key1 + key_start);
670  	const long *cp2 = (const long *)((const u8 *)key2 + key_start);
671  	int i;
672  
673  	for (i = key_start; i < key_end; i += sizeof(long))
674  		if (*cp1++ ^ *cp2++)
675  			return false;
676  
677  	return true;
678  }
679  
680  static bool flow_cmp_masked_key(const struct sw_flow *flow,
681  				const struct sw_flow_key *key,
682  				const struct sw_flow_key_range *range)
683  {
684  	return cmp_key(&flow->key, key, range->start, range->end);
685  }
686  
687  static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
688  				      const struct sw_flow_match *match)
689  {
690  	struct sw_flow_key *key = match->key;
691  	int key_start = flow_key_start(key);
692  	int key_end = match->range.end;
693  
694  	BUG_ON(ovs_identifier_is_ufid(&flow->id));
695  	return cmp_key(flow->id.unmasked_key, key, key_start, key_end);
696  }
697  
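/* Mask the key, hash only the bytes covered by the mask's range, and walk
 * the matching hash bucket; each call accounts for one mask lookup via
 * *n_mask_hit.
 */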
698  static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
699  					  const struct sw_flow_key *unmasked,
700  					  const struct sw_flow_mask *mask,
701  					  u32 *n_mask_hit)
702  {
703  	struct sw_flow *flow;
704  	struct hlist_head *head;
705  	u32 hash;
706  	struct sw_flow_key masked_key;
707  
708  	ovs_flow_mask_key(&masked_key, unmasked, false, mask);
709  	hash = flow_hash(&masked_key, &mask->range);
710  	head = find_bucket(ti, hash);
711  	(*n_mask_hit)++;
712  
713  	hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver],
714  				 lockdep_ovsl_is_held()) {
715  		if (flow->mask == mask && flow->flow_table.hash == hash &&
716  		    flow_cmp_masked_key(flow, &masked_key, &mask->range))
717  			return flow;
718  	}
719  	return NULL;
720  }
721  
722  /* Flow lookup does a full lookup on the flow table. It starts with the
723   * mask at the index passed in *index.
724   * This function MUST be called with BH disabled due to the use
725   * of per-CPU variables.
726   */
727  static struct sw_flow *flow_lookup(struct flow_table *tbl,
728  				   struct table_instance *ti,
729  				   struct mask_array *ma,
730  				   const struct sw_flow_key *key,
731  				   u32 *n_mask_hit,
732  				   u32 *n_cache_hit,
733  				   u32 *index)
734  {
735  	struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
736  	struct sw_flow *flow;
737  	struct sw_flow_mask *mask;
738  	int i;
739  
740  	if (likely(*index < ma->max)) {
741  		mask = rcu_dereference_ovsl(ma->masks[*index]);
742  		if (mask) {
743  			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
744  			if (flow) {
745  				u64_stats_update_begin(&stats->syncp);
746  				stats->usage_cntrs[*index]++;
747  				u64_stats_update_end(&stats->syncp);
748  				(*n_cache_hit)++;
749  				return flow;
750  			}
751  		}
752  	}
753  
754  	for (i = 0; i < ma->max; i++)  {
755  
756  		if (i == *index)
757  			continue;
758  
759  		mask = rcu_dereference_ovsl(ma->masks[i]);
760  		if (unlikely(!mask))
761  			break;
762  
763  		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
764  		if (flow) { /* Found */
765  			*index = i;
766  			u64_stats_update_begin(&stats->syncp);
767  			stats->usage_cntrs[*index]++;
768  			u64_stats_update_end(&stats->syncp);
769  			return flow;
770  		}
771  	}
772  
773  	return NULL;
774  }
775  
776  /*
777   * mask_cache maps a flow to its probable mask. The cache is not tightly
778   * coupled to the mask list, so updates to the mask list can leave stale
779   * entries in the mask cache.
780   * This is a per-CPU cache and is divided into MC_HASH_SEGS segments.
781   * In case of a hash collision the entry is hashed into the next segment.
782   */
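/* Each of the MC_HASH_SEGS probes roughly derives its cache index from the
 * low bits of the (re)hashed skb_hash:
 *
 *	index = hash & (mc->cache_size - 1);
 *	hash >>= MC_HASH_SHIFT;
 */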
783  struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
784  					  const struct sw_flow_key *key,
785  					  u32 skb_hash,
786  					  u32 *n_mask_hit,
787  					  u32 *n_cache_hit)
788  {
789  	struct mask_cache *mc = rcu_dereference(tbl->mask_cache);
790  	struct mask_array *ma = rcu_dereference(tbl->mask_array);
791  	struct table_instance *ti = rcu_dereference(tbl->ti);
792  	struct mask_cache_entry *entries, *ce;
793  	struct sw_flow *flow;
794  	u32 hash;
795  	int seg;
796  
797  	*n_mask_hit = 0;
798  	*n_cache_hit = 0;
799  	if (unlikely(!skb_hash || mc->cache_size == 0)) {
800  		u32 mask_index = 0;
801  		u32 cache = 0;
802  
803  		return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache,
804  				   &mask_index);
805  	}
806  
807  	/* Pre- and post-recirculation flows usually have the same skb_hash
808  	 * value. To avoid hash collisions, rehash the 'skb_hash' with
809  	 * 'recirc_id'.  */
810  	if (key->recirc_id)
811  		skb_hash = jhash_1word(skb_hash, key->recirc_id);
812  
813  	ce = NULL;
814  	hash = skb_hash;
815  	entries = this_cpu_ptr(mc->mask_cache);
816  
817  	/* Find the cache entry 'ce' to operate on. */
818  	for (seg = 0; seg < MC_HASH_SEGS; seg++) {
819  		int index = hash & (mc->cache_size - 1);
820  		struct mask_cache_entry *e;
821  
822  		e = &entries[index];
823  		if (e->skb_hash == skb_hash) {
824  			flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
825  					   n_cache_hit, &e->mask_index);
826  			if (!flow)
827  				e->skb_hash = 0;
828  			return flow;
829  		}
830  
831  		if (!ce || e->skb_hash < ce->skb_hash)
832  			ce = e;  /* A better replacement cache candidate. */
833  
834  		hash >>= MC_HASH_SHIFT;
835  	}
836  
837  	/* Cache miss, do full lookup. */
838  	flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit,
839  			   &ce->mask_index);
840  	if (flow)
841  		ce->skb_hash = skb_hash;
842  
843  	*n_cache_hit = 0;
844  	return flow;
845  }
846  
847  struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
848  				    const struct sw_flow_key *key)
849  {
850  	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
851  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
852  	u32 __always_unused n_mask_hit;
853  	u32 __always_unused n_cache_hit;
854  	struct sw_flow *flow;
855  	u32 index = 0;
856  
857  	/* This function gets called through the netlink interface and therefore
858  	 * is preemptible. However, flow_lookup() needs to be called with BH
859  	 * disabled due to the use of per-CPU variables.
860  	 */
861  	local_bh_disable();
862  	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
863  	local_bh_enable();
864  	return flow;
865  }
866  
867  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
868  					  const struct sw_flow_match *match)
869  {
870  	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
871  	int i;
872  
873  	/* Always called under ovs-mutex. */
874  	for (i = 0; i < ma->max; i++) {
875  		struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
876  		u32 __always_unused n_mask_hit;
877  		struct sw_flow_mask *mask;
878  		struct sw_flow *flow;
879  
880  		mask = ovsl_dereference(ma->masks[i]);
881  		if (!mask)
882  			continue;
883  
884  		flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
885  		if (flow && ovs_identifier_is_key(&flow->id) &&
886  		    ovs_flow_cmp_unmasked_key(flow, match)) {
887  			return flow;
888  		}
889  	}
890  
891  	return NULL;
892  }
893  
894  static u32 ufid_hash(const struct sw_flow_id *sfid)
895  {
896  	return jhash(sfid->ufid, sfid->ufid_len, 0);
897  }
898  
899  static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
900  			      const struct sw_flow_id *sfid)
901  {
902  	if (flow->id.ufid_len != sfid->ufid_len)
903  		return false;
904  
905  	return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
906  }
907  
908  bool ovs_flow_cmp(const struct sw_flow *flow,
909  		  const struct sw_flow_match *match)
910  {
911  	if (ovs_identifier_is_ufid(&flow->id))
912  		return flow_cmp_masked_key(flow, match->key, &match->range);
913  
914  	return ovs_flow_cmp_unmasked_key(flow, match);
915  }
916  
917  struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
918  					 const struct sw_flow_id *ufid)
919  {
920  	struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti);
921  	struct sw_flow *flow;
922  	struct hlist_head *head;
923  	u32 hash;
924  
925  	hash = ufid_hash(ufid);
926  	head = find_bucket(ti, hash);
927  	hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver],
928  				 lockdep_ovsl_is_held()) {
929  		if (flow->ufid_table.hash == hash &&
930  		    ovs_flow_cmp_ufid(flow, ufid))
931  			return flow;
932  	}
933  	return NULL;
934  }
935  
936  int ovs_flow_tbl_num_masks(const struct flow_table *table)
937  {
938  	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
939  	return READ_ONCE(ma->count);
940  }
941  
942  u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table)
943  {
944  	struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache);
945  
946  	return READ_ONCE(mc->cache_size);
947  }
948  
949  static struct table_instance *table_instance_expand(struct table_instance *ti,
950  						    bool ufid)
951  {
952  	return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
953  }
954  
955  /* Must be called with OVS mutex held. */
956  void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
957  {
958  	struct table_instance *ti = ovsl_dereference(table->ti);
959  	struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);
960  
961  	BUG_ON(table->count == 0);
962  	table_instance_flow_free(table, ti, ufid_ti, flow);
963  }
964  
965  static struct sw_flow_mask *mask_alloc(void)
966  {
967  	struct sw_flow_mask *mask;
968  
969  	mask = kmalloc(sizeof(*mask), GFP_KERNEL);
970  	if (mask)
971  		mask->ref_count = 1;
972  
973  	return mask;
974  }
975  
976  static bool mask_equal(const struct sw_flow_mask *a,
977  		       const struct sw_flow_mask *b)
978  {
979  	const u8 *a_ = (const u8 *)&a->key + a->range.start;
980  	const u8 *b_ = (const u8 *)&b->key + b->range.start;
981  
982  	return  (a->range.end == b->range.end)
983  		&& (a->range.start == b->range.start)
984  		&& (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
985  }
986  
987  static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
988  					   const struct sw_flow_mask *mask)
989  {
990  	struct mask_array *ma;
991  	int i;
992  
993  	ma = ovsl_dereference(tbl->mask_array);
994  	for (i = 0; i < ma->max; i++) {
995  		struct sw_flow_mask *t;
996  		t = ovsl_dereference(ma->masks[i]);
997  
998  		if (t && mask_equal(mask, t))
999  			return t;
1000  	}
1001  
1002  	return NULL;
1003  }
1004  
1005  /* Add 'mask' into the mask list, if it is not already there. */
1006  static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
1007  			    const struct sw_flow_mask *new)
1008  {
1009  	struct sw_flow_mask *mask;
1010  
1011  	mask = flow_mask_find(tbl, new);
1012  	if (!mask) {
1013  		/* Allocate a new mask if none exists. */
1014  		mask = mask_alloc();
1015  		if (!mask)
1016  			return -ENOMEM;
1017  		mask->key = new->key;
1018  		mask->range = new->range;
1019  
1020  		/* Add mask to mask-list. */
1021  		if (tbl_mask_array_add_mask(tbl, mask)) {
1022  			kfree(mask);
1023  			return -ENOMEM;
1024  		}
1025  	} else {
1026  		BUG_ON(!mask->ref_count);
1027  		mask->ref_count++;
1028  	}
1029  
1030  	flow->mask = mask;
1031  	return 0;
1032  }
1033  
1034  /* Must be called with OVS mutex held. */
1035  static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
1036  {
1037  	struct table_instance *new_ti = NULL;
1038  	struct table_instance *ti;
1039  
1040  	flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range);
1041  	ti = ovsl_dereference(table->ti);
1042  	table_instance_insert(ti, flow);
1043  	table->count++;
1044  
1045  	/* Expand table, if necessary, to make room. */
1046  	if (table->count > ti->n_buckets)
1047  		new_ti = table_instance_expand(ti, false);
1048  	else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
1049  		new_ti = table_instance_rehash(ti, ti->n_buckets, false);
1050  
1051  	if (new_ti) {
1052  		rcu_assign_pointer(table->ti, new_ti);
1053  		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
1054  		table->last_rehash = jiffies;
1055  	}
1056  }
1057  
1058  /* Must be called with OVS mutex held. */
1059  static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
1060  {
1061  	struct table_instance *ti;
1062  
1063  	flow->ufid_table.hash = ufid_hash(&flow->id);
1064  	ti = ovsl_dereference(table->ufid_ti);
1065  	ufid_table_instance_insert(ti, flow);
1066  	table->ufid_count++;
1067  
1068  	/* Expand table, if necessary, to make room. */
1069  	if (table->ufid_count > ti->n_buckets) {
1070  		struct table_instance *new_ti;
1071  
1072  		new_ti = table_instance_expand(ti, true);
1073  		if (new_ti) {
1074  			rcu_assign_pointer(table->ufid_ti, new_ti);
1075  			call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
1076  		}
1077  	}
1078  }
1079  
1080  /* Must be called with OVS mutex held. */
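/* Takes a reference on (or adds) the flow's mask, then inserts the flow into
 * the main hash table and, if it has a unique flow ID, into the UFID table
 * as well.
 */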
1081  int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
1082  			const struct sw_flow_mask *mask)
1083  {
1084  	int err;
1085  
1086  	err = flow_mask_insert(table, flow, mask);
1087  	if (err)
1088  		return err;
1089  	flow_key_insert(table, flow);
1090  	if (ovs_identifier_is_ufid(&flow->id))
1091  		flow_ufid_insert(table, flow);
1092  
1093  	return 0;
1094  }
1095  
1096  static int compare_mask_and_count(const void *a, const void *b)
1097  {
1098  	const struct mask_count *mc_a = a;
1099  	const struct mask_count *mc_b = b;
1100  
1101  	return (s64)mc_b->counter - (s64)mc_a->counter;
1102  }
1103  
1104  /* Must be called with OVS mutex held. */
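/* Sorts the masks by how often they were hit since the last rebalance and,
 * if the order changed, rebuilds the mask array in descending order of use
 * so that lookups try the most frequently used masks first.
 */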
1105  void ovs_flow_masks_rebalance(struct flow_table *table)
1106  {
1107  	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);
1108  	struct mask_count *masks_and_count;
1109  	struct mask_array *new;
1110  	int masks_entries = 0;
1111  	int i;
1112  
1113  	/* Build array of all current entries with use counters. */
1114  	masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count),
1115  					GFP_KERNEL);
1116  	if (!masks_and_count)
1117  		return;
1118  
1119  	for (i = 0; i < ma->max; i++) {
1120  		struct sw_flow_mask *mask;
1121  		int cpu;
1122  
1123  		mask = rcu_dereference_ovsl(ma->masks[i]);
1124  		if (unlikely(!mask))
1125  			break;
1126  
1127  		masks_and_count[i].index = i;
1128  		masks_and_count[i].counter = 0;
1129  
1130  		for_each_possible_cpu(cpu) {
1131  			struct mask_array_stats *stats;
1132  			unsigned int start;
1133  			u64 counter;
1134  
1135  			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
1136  			do {
1137  				start = u64_stats_fetch_begin(&stats->syncp);
1138  				counter = stats->usage_cntrs[i];
1139  			} while (u64_stats_fetch_retry(&stats->syncp, start));
1140  
1141  			masks_and_count[i].counter += counter;
1142  		}
1143  
1144  		/* Subtract the zero count value. */
1145  		masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i];
1146  
1147  		/* Rather than calling tbl_mask_array_reset_counters()
1148  		 * below when no change is needed, do it inline here.
1149  		 */
1150  		ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter;
1151  	}
1152  
1153  	if (i == 0)
1154  		goto free_mask_entries;
1155  
1156  	/* Sort the entries */
1157  	masks_entries = i;
1158  	sort(masks_and_count, masks_entries, sizeof(*masks_and_count),
1159  	     compare_mask_and_count, NULL);
1160  
1161  	/* If the order is the same, nothing to do... */
1162  	for (i = 0; i < masks_entries; i++) {
1163  		if (i != masks_and_count[i].index)
1164  			break;
1165  	}
1166  	if (i == masks_entries)
1167  		goto free_mask_entries;
1168  
1169  	/* Rebuild the new list in order of usage. */
1170  	new = tbl_mask_array_alloc(ma->max);
1171  	if (!new)
1172  		goto free_mask_entries;
1173  
1174  	for (i = 0; i < masks_entries; i++) {
1175  		int index = masks_and_count[i].index;
1176  
1177  		if (ovsl_dereference(ma->masks[index]))
1178  			new->masks[new->count++] = ma->masks[index];
1179  	}
1180  
1181  	rcu_assign_pointer(table->mask_array, new);
1182  	call_rcu(&ma->rcu, mask_array_rcu_cb);
1183  
1184  free_mask_entries:
1185  	kfree(masks_and_count);
1186  }
1187  
1188  /* Initializes the flow module.
1189   * Returns zero if successful or a negative error code. */
1190  int ovs_flow_init(void)
1191  {
1192  	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
1193  	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));
1194  
1195  	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
1196  				       + (nr_cpu_ids
1197  					  * sizeof(struct sw_flow_stats *))
1198  				       + cpumask_size(),
1199  				       0, 0, NULL);
1200  	if (flow_cache == NULL)
1201  		return -ENOMEM;
1202  
1203  	flow_stats_cache
1204  		= kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats),
1205  				    0, SLAB_HWCACHE_ALIGN, NULL);
1206  	if (flow_stats_cache == NULL) {
1207  		kmem_cache_destroy(flow_cache);
1208  		flow_cache = NULL;
1209  		return -ENOMEM;
1210  	}
1211  
1212  	return 0;
1213  }
1214  
1215  /* Uninitializes the flow module. */
1216  void ovs_flow_exit(void)
1217  {
1218  	kmem_cache_destroy(flow_stats_cache);
1219  	kmem_cache_destroy(flow_cache);
1220  }
1221