xref: /linux/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c (revision 372e2db7210df7c45ead46429aeb1443ba148060)
1 /*
2  * drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
3  * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
4  * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
5  * Copyright (c) 2016 Ido Schimmel <idosch@mellanox.com>
6  * Copyright (c) 2016 Yotam Gigi <yotamg@mellanox.com>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the names of the copyright holders nor the names of its
17  *    contributors may be used to endorse or promote products derived from
18  *    this software without specific prior written permission.
19  *
20  * Alternatively, this software may be distributed under the terms of the
21  * GNU General Public License ("GPL") version 2 as published by the Free
22  * Software Foundation.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
25  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
28  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
29  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
30  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
31  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
32  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34  * POSSIBILITY OF SUCH DAMAGE.
35  */
36 
37 #include <linux/kernel.h>
38 #include <linux/types.h>
39 #include <linux/rhashtable.h>
40 #include <linux/bitops.h>
41 #include <linux/in6.h>
42 #include <linux/notifier.h>
43 #include <net/netevent.h>
44 #include <net/neighbour.h>
45 #include <net/arp.h>
46 #include <net/ip_fib.h>
47 
48 #include "spectrum.h"
49 #include "core.h"
50 #include "reg.h"
51 
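/* A prefix usage set is a bitmap with one bit per prefix length that
 * is in use in a FIB. The helpers below implement iteration, subset
 * and equality tests, and set/clear operations over these bitmaps;
 * they are used further down to decide whether an LPM tree can be
 * shared or must be replaced when the set of used prefix lengths
 * changes.
 */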
52 #define mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage) \
53 	for_each_set_bit(prefix, (prefix_usage)->b, MLXSW_SP_PREFIX_COUNT)
54 
55 static bool
56 mlxsw_sp_prefix_usage_subset(struct mlxsw_sp_prefix_usage *prefix_usage1,
57 			     struct mlxsw_sp_prefix_usage *prefix_usage2)
58 {
59 	unsigned char prefix;
60 
61 	mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage1) {
62 		if (!test_bit(prefix, prefix_usage2->b))
63 			return false;
64 	}
65 	return true;
66 }
67 
68 static bool
69 mlxsw_sp_prefix_usage_eq(struct mlxsw_sp_prefix_usage *prefix_usage1,
70 			 struct mlxsw_sp_prefix_usage *prefix_usage2)
71 {
72 	return !memcmp(prefix_usage1, prefix_usage2, sizeof(*prefix_usage1));
73 }
74 
75 static bool
76 mlxsw_sp_prefix_usage_none(struct mlxsw_sp_prefix_usage *prefix_usage)
77 {
78 	struct mlxsw_sp_prefix_usage prefix_usage_none = {{ 0 }};
79 
80 	return mlxsw_sp_prefix_usage_eq(prefix_usage, &prefix_usage_none);
81 }
82 
83 static void
84 mlxsw_sp_prefix_usage_cpy(struct mlxsw_sp_prefix_usage *prefix_usage1,
85 			  struct mlxsw_sp_prefix_usage *prefix_usage2)
86 {
87 	memcpy(prefix_usage1, prefix_usage2, sizeof(*prefix_usage1));
88 }
89 
90 static void
91 mlxsw_sp_prefix_usage_zero(struct mlxsw_sp_prefix_usage *prefix_usage)
92 {
93 	memset(prefix_usage, 0, sizeof(*prefix_usage));
94 }
95 
96 static void
97 mlxsw_sp_prefix_usage_set(struct mlxsw_sp_prefix_usage *prefix_usage,
98 			  unsigned char prefix_len)
99 {
100 	set_bit(prefix_len, prefix_usage->b);
101 }
102 
103 static void
104 mlxsw_sp_prefix_usage_clear(struct mlxsw_sp_prefix_usage *prefix_usage,
105 			    unsigned char prefix_len)
106 {
107 	clear_bit(prefix_len, prefix_usage->b);
108 }
109 
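/* A FIB entry is keyed by the netdev, the destination address (sized
 * for IPv6, the largest supported address) and the prefix length.
 */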
110 struct mlxsw_sp_fib_key {
111 	struct net_device *dev;
112 	unsigned char addr[sizeof(struct in6_addr)];
113 	unsigned char prefix_len;
114 };
115 
116 enum mlxsw_sp_fib_entry_type {
117 	MLXSW_SP_FIB_ENTRY_TYPE_REMOTE,
118 	MLXSW_SP_FIB_ENTRY_TYPE_LOCAL,
119 	MLXSW_SP_FIB_ENTRY_TYPE_TRAP,
120 };
121 
122 struct mlxsw_sp_nexthop_group;
123 
124 struct mlxsw_sp_fib_entry {
125 	struct rhash_head ht_node;
126 	struct list_head list;
127 	struct mlxsw_sp_fib_key key;
128 	enum mlxsw_sp_fib_entry_type type;
129 	unsigned int ref_count;
130 	u16 rif; /* used for action local */
131 	struct mlxsw_sp_vr *vr;
132 	struct fib_info *fi;
133 	struct list_head nexthop_group_node;
134 	struct mlxsw_sp_nexthop_group *nh_group;
135 };
136 
137 struct mlxsw_sp_fib {
138 	struct rhashtable ht;
139 	struct list_head entry_list;
140 	unsigned long prefix_ref_count[MLXSW_SP_PREFIX_COUNT];
141 	struct mlxsw_sp_prefix_usage prefix_usage;
142 };
143 
144 static const struct rhashtable_params mlxsw_sp_fib_ht_params = {
145 	.key_offset = offsetof(struct mlxsw_sp_fib_entry, key),
146 	.head_offset = offsetof(struct mlxsw_sp_fib_entry, ht_node),
147 	.key_len = sizeof(struct mlxsw_sp_fib_key),
148 	.automatic_shrinking = true,
149 };
150 
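/* Insertion and removal keep a per-prefix-length reference count in
 * the FIB: the first entry using a given prefix length sets the
 * corresponding bit in the FIB's prefix usage and the last one clears
 * it, so the usage bitmap always reflects the prefix lengths that are
 * actually present.
 */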
151 static int mlxsw_sp_fib_entry_insert(struct mlxsw_sp_fib *fib,
152 				     struct mlxsw_sp_fib_entry *fib_entry)
153 {
154 	unsigned char prefix_len = fib_entry->key.prefix_len;
155 	int err;
156 
157 	err = rhashtable_insert_fast(&fib->ht, &fib_entry->ht_node,
158 				     mlxsw_sp_fib_ht_params);
159 	if (err)
160 		return err;
161 	list_add_tail(&fib_entry->list, &fib->entry_list);
162 	if (fib->prefix_ref_count[prefix_len]++ == 0)
163 		mlxsw_sp_prefix_usage_set(&fib->prefix_usage, prefix_len);
164 	return 0;
165 }
166 
167 static void mlxsw_sp_fib_entry_remove(struct mlxsw_sp_fib *fib,
168 				      struct mlxsw_sp_fib_entry *fib_entry)
169 {
170 	unsigned char prefix_len = fib_entry->key.prefix_len;
171 
172 	if (--fib->prefix_ref_count[prefix_len] == 0)
173 		mlxsw_sp_prefix_usage_clear(&fib->prefix_usage, prefix_len);
174 	list_del(&fib_entry->list);
175 	rhashtable_remove_fast(&fib->ht, &fib_entry->ht_node,
176 			       mlxsw_sp_fib_ht_params);
177 }
178 
179 static struct mlxsw_sp_fib_entry *
180 mlxsw_sp_fib_entry_create(struct mlxsw_sp_fib *fib, const void *addr,
181 			  size_t addr_len, unsigned char prefix_len,
182 			  struct net_device *dev)
183 {
184 	struct mlxsw_sp_fib_entry *fib_entry;
185 
186 	fib_entry = kzalloc(sizeof(*fib_entry), GFP_KERNEL);
187 	if (!fib_entry)
188 		return NULL;
189 	fib_entry->key.dev = dev;
190 	memcpy(fib_entry->key.addr, addr, addr_len);
191 	fib_entry->key.prefix_len = prefix_len;
192 	return fib_entry;
193 }
194 
195 static void mlxsw_sp_fib_entry_destroy(struct mlxsw_sp_fib_entry *fib_entry)
196 {
197 	kfree(fib_entry);
198 }
199 
200 static struct mlxsw_sp_fib_entry *
201 mlxsw_sp_fib_entry_lookup(struct mlxsw_sp_fib *fib, const void *addr,
202 			  size_t addr_len, unsigned char prefix_len,
203 			  struct net_device *dev)
204 {
205 	struct mlxsw_sp_fib_key key;
206 
207 	memset(&key, 0, sizeof(key));
208 	key.dev = dev;
209 	memcpy(key.addr, addr, addr_len);
210 	key.prefix_len = prefix_len;
211 	return rhashtable_lookup_fast(&fib->ht, &key, mlxsw_sp_fib_ht_params);
212 }
213 
214 static struct mlxsw_sp_fib *mlxsw_sp_fib_create(void)
215 {
216 	struct mlxsw_sp_fib *fib;
217 	int err;
218 
219 	fib = kzalloc(sizeof(*fib), GFP_KERNEL);
220 	if (!fib)
221 		return ERR_PTR(-ENOMEM);
222 	err = rhashtable_init(&fib->ht, &mlxsw_sp_fib_ht_params);
223 	if (err)
224 		goto err_rhashtable_init;
225 	INIT_LIST_HEAD(&fib->entry_list);
226 	return fib;
227 
228 err_rhashtable_init:
229 	kfree(fib);
230 	return ERR_PTR(err);
231 }
232 
233 static void mlxsw_sp_fib_destroy(struct mlxsw_sp_fib *fib)
234 {
235 	rhashtable_destroy(&fib->ht);
236 	kfree(fib);
237 }
238 
239 static struct mlxsw_sp_lpm_tree *
240 mlxsw_sp_lpm_tree_find_unused(struct mlxsw_sp *mlxsw_sp, bool one_reserved)
241 {
242 	struct mlxsw_sp_lpm_tree *lpm_tree;
243 	int i;
244 
245 	for (i = 0; i < MLXSW_SP_LPM_TREE_COUNT; i++) {
246 		lpm_tree = &mlxsw_sp->router.lpm_trees[i];
247 		if (lpm_tree->ref_count == 0) {
248 			if (one_reserved)
249 				one_reserved = false;
250 			else
251 				return lpm_tree;
252 		}
253 	}
254 	return NULL;
255 }
256 
257 static int mlxsw_sp_lpm_tree_alloc(struct mlxsw_sp *mlxsw_sp,
258 				   struct mlxsw_sp_lpm_tree *lpm_tree)
259 {
260 	char ralta_pl[MLXSW_REG_RALTA_LEN];
261 
262 	mlxsw_reg_ralta_pack(ralta_pl, true,
263 			     (enum mlxsw_reg_ralxx_protocol) lpm_tree->proto,
264 			     lpm_tree->id);
265 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl);
266 }
267 
268 static int mlxsw_sp_lpm_tree_free(struct mlxsw_sp *mlxsw_sp,
269 				  struct mlxsw_sp_lpm_tree *lpm_tree)
270 {
271 	char ralta_pl[MLXSW_REG_RALTA_LEN];
272 
273 	mlxsw_reg_ralta_pack(ralta_pl, false,
274 			     (enum mlxsw_reg_ralxx_protocol) lpm_tree->proto,
275 			     lpm_tree->id);
276 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl);
277 }
278 
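/* Program the tree structure via RALST: the longest used prefix
 * length becomes the root bin, and each remaining used length is
 * linked in as the left child of the next longer one, forming a
 * chain down from the root with no right children.
 */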
279 static int
280 mlxsw_sp_lpm_tree_left_struct_set(struct mlxsw_sp *mlxsw_sp,
281 				  struct mlxsw_sp_prefix_usage *prefix_usage,
282 				  struct mlxsw_sp_lpm_tree *lpm_tree)
283 {
284 	char ralst_pl[MLXSW_REG_RALST_LEN];
285 	u8 root_bin = 0;
286 	u8 prefix;
287 	u8 last_prefix = MLXSW_REG_RALST_BIN_NO_CHILD;
288 
289 	mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage)
290 		root_bin = prefix;
291 
292 	mlxsw_reg_ralst_pack(ralst_pl, root_bin, lpm_tree->id);
293 	mlxsw_sp_prefix_usage_for_each(prefix, prefix_usage) {
294 		if (prefix == 0)
295 			continue;
296 		mlxsw_reg_ralst_bin_pack(ralst_pl, prefix, last_prefix,
297 					 MLXSW_REG_RALST_BIN_NO_CHILD);
298 		last_prefix = prefix;
299 	}
300 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralst), ralst_pl);
301 }
302 
303 static struct mlxsw_sp_lpm_tree *
304 mlxsw_sp_lpm_tree_create(struct mlxsw_sp *mlxsw_sp,
305 			 struct mlxsw_sp_prefix_usage *prefix_usage,
306 			 enum mlxsw_sp_l3proto proto, bool one_reserved)
307 {
308 	struct mlxsw_sp_lpm_tree *lpm_tree;
309 	int err;
310 
311 	lpm_tree = mlxsw_sp_lpm_tree_find_unused(mlxsw_sp, one_reserved);
312 	if (!lpm_tree)
313 		return ERR_PTR(-EBUSY);
314 	lpm_tree->proto = proto;
315 	err = mlxsw_sp_lpm_tree_alloc(mlxsw_sp, lpm_tree);
316 	if (err)
317 		return ERR_PTR(err);
318 
319 	err = mlxsw_sp_lpm_tree_left_struct_set(mlxsw_sp, prefix_usage,
320 						lpm_tree);
321 	if (err)
322 		goto err_left_struct_set;
323 	memcpy(&lpm_tree->prefix_usage, prefix_usage,
324 	       sizeof(lpm_tree->prefix_usage));
325 	return lpm_tree;
326 
327 err_left_struct_set:
328 	mlxsw_sp_lpm_tree_free(mlxsw_sp, lpm_tree);
329 	return ERR_PTR(err);
330 }
331 
332 static int mlxsw_sp_lpm_tree_destroy(struct mlxsw_sp *mlxsw_sp,
333 				     struct mlxsw_sp_lpm_tree *lpm_tree)
334 {
335 	return mlxsw_sp_lpm_tree_free(mlxsw_sp, lpm_tree);
336 }
337 
338 static struct mlxsw_sp_lpm_tree *
339 mlxsw_sp_lpm_tree_get(struct mlxsw_sp *mlxsw_sp,
340 		      struct mlxsw_sp_prefix_usage *prefix_usage,
341 		      enum mlxsw_sp_l3proto proto, bool one_reserved)
342 {
343 	struct mlxsw_sp_lpm_tree *lpm_tree;
344 	int i;
345 
346 	for (i = 0; i < MLXSW_SP_LPM_TREE_COUNT; i++) {
347 		lpm_tree = &mlxsw_sp->router.lpm_trees[i];
348 		if (lpm_tree->ref_count != 0 &&
349 		    lpm_tree->proto == proto &&
350 		    mlxsw_sp_prefix_usage_eq(&lpm_tree->prefix_usage,
351 					     prefix_usage))
352 			goto inc_ref_count;
353 	}
354 	lpm_tree = mlxsw_sp_lpm_tree_create(mlxsw_sp, prefix_usage,
355 					    proto, one_reserved);
356 	if (IS_ERR(lpm_tree))
357 		return lpm_tree;
358 
359 inc_ref_count:
360 	lpm_tree->ref_count++;
361 	return lpm_tree;
362 }
363 
364 static int mlxsw_sp_lpm_tree_put(struct mlxsw_sp *mlxsw_sp,
365 				 struct mlxsw_sp_lpm_tree *lpm_tree)
366 {
367 	if (--lpm_tree->ref_count == 0)
368 		return mlxsw_sp_lpm_tree_destroy(mlxsw_sp, lpm_tree);
369 	return 0;
370 }
371 
372 static void mlxsw_sp_lpm_init(struct mlxsw_sp *mlxsw_sp)
373 {
374 	struct mlxsw_sp_lpm_tree *lpm_tree;
375 	int i;
376 
377 	for (i = 0; i < MLXSW_SP_LPM_TREE_COUNT; i++) {
378 		lpm_tree = &mlxsw_sp->router.lpm_trees[i];
379 		lpm_tree->id = i + MLXSW_SP_LPM_TREE_MIN;
380 	}
381 }
382 
383 static struct mlxsw_sp_vr *mlxsw_sp_vr_find_unused(struct mlxsw_sp *mlxsw_sp)
384 {
385 	struct mlxsw_sp_vr *vr;
386 	int i;
387 
388 	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
389 		vr = &mlxsw_sp->router.vrs[i];
390 		if (!vr->used)
391 			return vr;
392 	}
393 	return NULL;
394 }
395 
396 static int mlxsw_sp_vr_lpm_tree_bind(struct mlxsw_sp *mlxsw_sp,
397 				     struct mlxsw_sp_vr *vr)
398 {
399 	char raltb_pl[MLXSW_REG_RALTB_LEN];
400 
401 	mlxsw_reg_raltb_pack(raltb_pl, vr->id,
402 			     (enum mlxsw_reg_ralxx_protocol) vr->proto,
403 			     vr->lpm_tree->id);
404 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), raltb_pl);
405 }
406 
407 static int mlxsw_sp_vr_lpm_tree_unbind(struct mlxsw_sp *mlxsw_sp,
408 				       struct mlxsw_sp_vr *vr)
409 {
410 	char raltb_pl[MLXSW_REG_RALTB_LEN];
411 
412 	/* Bind to tree 0, which is the default */
413 	mlxsw_reg_raltb_pack(raltb_pl, vr->id,
414 			     (enum mlxsw_reg_ralxx_protocol) vr->proto, 0);
415 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), raltb_pl);
416 }
417 
418 static u32 mlxsw_sp_fix_tb_id(u32 tb_id)
419 {
420 	/* For our purposes, squash the main and local tables into one */
421 	if (tb_id == RT_TABLE_LOCAL)
422 		tb_id = RT_TABLE_MAIN;
423 	return tb_id;
424 }
425 
426 static struct mlxsw_sp_vr *mlxsw_sp_vr_find(struct mlxsw_sp *mlxsw_sp,
427 					    u32 tb_id,
428 					    enum mlxsw_sp_l3proto proto)
429 {
430 	struct mlxsw_sp_vr *vr;
431 	int i;
432 
433 	tb_id = mlxsw_sp_fix_tb_id(tb_id);
434 
435 	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
436 		vr = &mlxsw_sp->router.vrs[i];
437 		if (vr->used && vr->proto == proto && vr->tb_id == tb_id)
438 			return vr;
439 	}
440 	return NULL;
441 }
442 
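/* Creating a virtual router allocates a FIB for it, obtains an LPM
 * tree matching the initial prefix usage and binds the router to that
 * tree in hardware. One unused tree is kept in reserve here
 * (one_reserved), presumably so that a later tree replacement for an
 * existing router cannot be starved of trees.
 */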
443 static struct mlxsw_sp_vr *mlxsw_sp_vr_create(struct mlxsw_sp *mlxsw_sp,
444 					      unsigned char prefix_len,
445 					      u32 tb_id,
446 					      enum mlxsw_sp_l3proto proto)
447 {
448 	struct mlxsw_sp_prefix_usage req_prefix_usage;
449 	struct mlxsw_sp_lpm_tree *lpm_tree;
450 	struct mlxsw_sp_vr *vr;
451 	int err;
452 
453 	vr = mlxsw_sp_vr_find_unused(mlxsw_sp);
454 	if (!vr)
455 		return ERR_PTR(-EBUSY);
456 	vr->fib = mlxsw_sp_fib_create();
457 	if (IS_ERR(vr->fib))
458 		return ERR_CAST(vr->fib);
459 
460 	vr->proto = proto;
461 	vr->tb_id = tb_id;
462 	mlxsw_sp_prefix_usage_zero(&req_prefix_usage);
463 	mlxsw_sp_prefix_usage_set(&req_prefix_usage, prefix_len);
464 	lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, &req_prefix_usage,
465 					 proto, true);
466 	if (IS_ERR(lpm_tree)) {
467 		err = PTR_ERR(lpm_tree);
468 		goto err_tree_get;
469 	}
470 	vr->lpm_tree = lpm_tree;
471 	err = mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr);
472 	if (err)
473 		goto err_tree_bind;
474 
475 	vr->used = true;
476 	return vr;
477 
478 err_tree_bind:
479 	mlxsw_sp_lpm_tree_put(mlxsw_sp, vr->lpm_tree);
480 err_tree_get:
481 	mlxsw_sp_fib_destroy(vr->fib);
482 
483 	return ERR_PTR(err);
484 }
485 
486 static void mlxsw_sp_vr_destroy(struct mlxsw_sp *mlxsw_sp,
487 				struct mlxsw_sp_vr *vr)
488 {
489 	mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, vr);
490 	mlxsw_sp_lpm_tree_put(mlxsw_sp, vr->lpm_tree);
491 	mlxsw_sp_fib_destroy(vr->fib);
492 	vr->used = false;
493 }
494 
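/* Make sure the virtual router is bound to a tree that covers the
 * required prefix usage. If an exact tree cannot be obtained, the
 * current tree is kept as long as the requirement is a subset of what
 * it already supports; otherwise the router is rebound to the new
 * tree and the old one is released.
 */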
495 static int
496 mlxsw_sp_vr_lpm_tree_check(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr,
497 			   struct mlxsw_sp_prefix_usage *req_prefix_usage)
498 {
499 	struct mlxsw_sp_lpm_tree *lpm_tree;
500 
501 	if (mlxsw_sp_prefix_usage_eq(req_prefix_usage,
502 				     &vr->lpm_tree->prefix_usage))
503 		return 0;
504 
505 	lpm_tree = mlxsw_sp_lpm_tree_get(mlxsw_sp, req_prefix_usage,
506 					 vr->proto, false);
507 	if (IS_ERR(lpm_tree)) {
508 		/* We failed to get a tree according to the required
509 		 * prefix usage. However, the current tree might still be
510 		 * good for us if our requirement is a subset of the
511 		 * prefixes used in the tree.
512 		 */
513 		if (mlxsw_sp_prefix_usage_subset(req_prefix_usage,
514 						 &vr->lpm_tree->prefix_usage))
515 			return 0;
516 		return PTR_ERR(lpm_tree);
517 	}
518 
519 	mlxsw_sp_vr_lpm_tree_unbind(mlxsw_sp, vr);
520 	mlxsw_sp_lpm_tree_put(mlxsw_sp, vr->lpm_tree);
521 	vr->lpm_tree = lpm_tree;
522 	return mlxsw_sp_vr_lpm_tree_bind(mlxsw_sp, vr);
523 }
524 
525 static struct mlxsw_sp_vr *mlxsw_sp_vr_get(struct mlxsw_sp *mlxsw_sp,
526 					   unsigned char prefix_len,
527 					   u32 tb_id,
528 					   enum mlxsw_sp_l3proto proto)
529 {
530 	struct mlxsw_sp_vr *vr;
531 	int err;
532 
533 	tb_id = mlxsw_sp_fix_tb_id(tb_id);
534 	vr = mlxsw_sp_vr_find(mlxsw_sp, tb_id, proto);
535 	if (!vr) {
536 		vr = mlxsw_sp_vr_create(mlxsw_sp, prefix_len, tb_id, proto);
537 		if (IS_ERR(vr))
538 			return vr;
539 	} else {
540 		struct mlxsw_sp_prefix_usage req_prefix_usage;
541 
542 		mlxsw_sp_prefix_usage_cpy(&req_prefix_usage,
543 					  &vr->fib->prefix_usage);
544 		mlxsw_sp_prefix_usage_set(&req_prefix_usage, prefix_len);
545 		/* Need to replace LPM tree in case new prefix is required. */
546 		err = mlxsw_sp_vr_lpm_tree_check(mlxsw_sp, vr,
547 						 &req_prefix_usage);
548 		if (err)
549 			return ERR_PTR(err);
550 	}
551 	return vr;
552 }
553 
554 static void mlxsw_sp_vr_put(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_vr *vr)
555 {
556 	/* Destroy the virtual router entity in case the associated FIB is
557 	 * empty, allowing it to be reused for other tables in the future.
558 	 * Otherwise, check whether some prefix usage disappeared and switch
559 	 * to a smaller tree if that is the case. Note that if a new,
560 	 * smaller tree cannot be allocated, the original one is kept in use.
561 	 */
562 	if (mlxsw_sp_prefix_usage_none(&vr->fib->prefix_usage))
563 		mlxsw_sp_vr_destroy(mlxsw_sp, vr);
564 	else
565 		mlxsw_sp_vr_lpm_tree_check(mlxsw_sp, vr,
566 					   &vr->fib->prefix_usage);
567 }
568 
569 static int mlxsw_sp_vrs_init(struct mlxsw_sp *mlxsw_sp)
570 {
571 	struct mlxsw_sp_vr *vr;
572 	u64 max_vrs;
573 	int i;
574 
575 	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_VRS))
576 		return -EIO;
577 
578 	max_vrs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS);
579 	mlxsw_sp->router.vrs = kcalloc(max_vrs, sizeof(struct mlxsw_sp_vr),
580 				       GFP_KERNEL);
581 	if (!mlxsw_sp->router.vrs)
582 		return -ENOMEM;
583 
584 	for (i = 0; i < max_vrs; i++) {
585 		vr = &mlxsw_sp->router.vrs[i];
586 		vr->id = i;
587 	}
588 
589 	return 0;
590 }
591 
592 static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp);
593 
594 static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp)
595 {
596 	/* At this stage we're guaranteed not to have new incoming
597 	 * FIB notifications and the work queue is free from FIBs
598 	 * sitting on top of mlxsw netdevs. However, we can still
599 	 * have other FIBs queued. Flush the queue before flushing
600 	 * the device's tables. No need for locks, as we're the only
601 	 * writer.
602 	 */
603 	mlxsw_core_flush_owq();
604 	mlxsw_sp_router_fib_flush(mlxsw_sp);
605 	kfree(mlxsw_sp->router.vrs);
606 }
607 
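/* Neighbour tracking: each offloaded neighbour is mirrored by a
 * mlxsw_sp_neigh_entry, kept in a hash table keyed by the kernel's
 * struct neighbour pointer. The entry records the RIF, the cached
 * MAC and the list of nexthops that resolve through this neighbour.
 */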
608 struct mlxsw_sp_neigh_key {
609 	struct neighbour *n;
610 };
611 
612 struct mlxsw_sp_neigh_entry {
613 	struct rhash_head ht_node;
614 	struct mlxsw_sp_neigh_key key;
615 	u16 rif;
616 	bool offloaded;
617 	struct delayed_work dw;
618 	struct mlxsw_sp_port *mlxsw_sp_port;
619 	unsigned char ha[ETH_ALEN];
620 	struct list_head nexthop_list; /* list of nexthops using
621 					* this neigh entry
622 					*/
623 	struct list_head nexthop_neighs_list_node;
624 };
625 
626 static const struct rhashtable_params mlxsw_sp_neigh_ht_params = {
627 	.key_offset = offsetof(struct mlxsw_sp_neigh_entry, key),
628 	.head_offset = offsetof(struct mlxsw_sp_neigh_entry, ht_node),
629 	.key_len = sizeof(struct mlxsw_sp_neigh_key),
630 };
631 
632 static int
633 mlxsw_sp_neigh_entry_insert(struct mlxsw_sp *mlxsw_sp,
634 			    struct mlxsw_sp_neigh_entry *neigh_entry)
635 {
636 	return rhashtable_insert_fast(&mlxsw_sp->router.neigh_ht,
637 				      &neigh_entry->ht_node,
638 				      mlxsw_sp_neigh_ht_params);
639 }
640 
641 static void
642 mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp,
643 			    struct mlxsw_sp_neigh_entry *neigh_entry)
644 {
645 	rhashtable_remove_fast(&mlxsw_sp->router.neigh_ht,
646 			       &neigh_entry->ht_node,
647 			       mlxsw_sp_neigh_ht_params);
648 }
649 
650 static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work);
651 
652 static struct mlxsw_sp_neigh_entry *
653 mlxsw_sp_neigh_entry_create(struct neighbour *n, u16 rif)
654 {
655 	struct mlxsw_sp_neigh_entry *neigh_entry;
656 
657 	neigh_entry = kzalloc(sizeof(*neigh_entry), GFP_ATOMIC);
658 	if (!neigh_entry)
659 		return NULL;
660 	neigh_entry->key.n = n;
661 	neigh_entry->rif = rif;
662 	INIT_DELAYED_WORK(&neigh_entry->dw, mlxsw_sp_router_neigh_update_hw);
663 	INIT_LIST_HEAD(&neigh_entry->nexthop_list);
664 	return neigh_entry;
665 }
666 
667 static void
668 mlxsw_sp_neigh_entry_destroy(struct mlxsw_sp_neigh_entry *neigh_entry)
669 {
670 	kfree(neigh_entry);
671 }
672 
673 static struct mlxsw_sp_neigh_entry *
674 mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct neighbour *n)
675 {
676 	struct mlxsw_sp_neigh_key key;
677 
678 	key.n = n;
679 	return rhashtable_lookup_fast(&mlxsw_sp->router.neigh_ht,
680 				      &key, mlxsw_sp_neigh_ht_params);
681 }
682 
683 int mlxsw_sp_router_neigh_construct(struct net_device *dev,
684 				    struct neighbour *n)
685 {
686 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
687 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
688 	struct mlxsw_sp_neigh_entry *neigh_entry;
689 	struct mlxsw_sp_rif *r;
690 	int err;
691 
692 	if (n->tbl != &arp_tbl)
693 		return 0;
694 
695 	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
696 	if (neigh_entry)
697 		return 0;
698 
699 	r = mlxsw_sp_rif_find_by_dev(mlxsw_sp, n->dev);
700 	if (WARN_ON(!r))
701 		return -EINVAL;
702 
703 	neigh_entry = mlxsw_sp_neigh_entry_create(n, r->rif);
704 	if (!neigh_entry)
705 		return -ENOMEM;
706 	err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry);
707 	if (err)
708 		goto err_neigh_entry_insert;
709 	return 0;
710 
711 err_neigh_entry_insert:
712 	mlxsw_sp_neigh_entry_destroy(neigh_entry);
713 	return err;
714 }
715 
716 void mlxsw_sp_router_neigh_destroy(struct net_device *dev,
717 				   struct neighbour *n)
718 {
719 	struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev);
720 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
721 	struct mlxsw_sp_neigh_entry *neigh_entry;
722 
723 	if (n->tbl != &arp_tbl)
724 		return;
725 
726 	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
727 	if (!neigh_entry)
728 		return;
729 	mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry);
730 	mlxsw_sp_neigh_entry_destroy(neigh_entry);
731 }
732 
733 static void
734 mlxsw_sp_router_neighs_update_interval_init(struct mlxsw_sp *mlxsw_sp)
735 {
736 	unsigned long interval = NEIGH_VAR(&arp_tbl.parms, DELAY_PROBE_TIME);
737 
738 	mlxsw_sp->router.neighs_update.interval = jiffies_to_msecs(interval);
739 }
740 
741 static void mlxsw_sp_router_neigh_ent_ipv4_process(struct mlxsw_sp *mlxsw_sp,
742 						   char *rauhtd_pl,
743 						   int ent_index)
744 {
745 	struct net_device *dev;
746 	struct neighbour *n;
747 	__be32 dipn;
748 	u32 dip;
749 	u16 rif;
750 
751 	mlxsw_reg_rauhtd_ent_ipv4_unpack(rauhtd_pl, ent_index, &rif, &dip);
752 
753 	if (!mlxsw_sp->rifs[rif]) {
754 		dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Incorrect RIF in neighbour entry\n");
755 		return;
756 	}
757 
758 	dipn = htonl(dip);
759 	dev = mlxsw_sp->rifs[rif]->dev;
760 	n = neigh_lookup(&arp_tbl, &dipn, dev);
761 	if (!n) {
762 		netdev_err(dev, "Failed to find matching neighbour for IP=%pI4h\n",
763 			   &dip);
764 		return;
765 	}
766 
767 	netdev_dbg(dev, "Updating neighbour with IP=%pI4h\n", &dip);
768 	neigh_event_send(n, NULL);
769 	neigh_release(n);
770 }
771 
772 static void mlxsw_sp_router_neigh_rec_ipv4_process(struct mlxsw_sp *mlxsw_sp,
773 						   char *rauhtd_pl,
774 						   int rec_index)
775 {
776 	u8 num_entries;
777 	int i;
778 
779 	num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl,
780 								rec_index);
781 	/* Hardware starts counting at 0, so add 1. */
782 	num_entries++;
783 
784 	/* Each record consists of several neighbour entries. */
785 	for (i = 0; i < num_entries; i++) {
786 		int ent_index;
787 
788 		ent_index = rec_index * MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC + i;
789 		mlxsw_sp_router_neigh_ent_ipv4_process(mlxsw_sp, rauhtd_pl,
790 						       ent_index);
791 	}
792 
793 }
794 
795 static void mlxsw_sp_router_neigh_rec_process(struct mlxsw_sp *mlxsw_sp,
796 					      char *rauhtd_pl, int rec_index)
797 {
798 	switch (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, rec_index)) {
799 	case MLXSW_REG_RAUHTD_TYPE_IPV4:
800 		mlxsw_sp_router_neigh_rec_ipv4_process(mlxsw_sp, rauhtd_pl,
801 						       rec_index);
802 		break;
803 	case MLXSW_REG_RAUHTD_TYPE_IPV6:
804 		WARN_ON_ONCE(1);
805 		break;
806 	}
807 }
808 
809 static bool mlxsw_sp_router_rauhtd_is_full(char *rauhtd_pl)
810 {
811 	u8 num_rec, last_rec_index, num_entries;
812 
813 	num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl);
814 	last_rec_index = num_rec - 1;
815 
816 	if (num_rec < MLXSW_REG_RAUHTD_REC_MAX_NUM)
817 		return false;
818 	if (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, last_rec_index) ==
819 	    MLXSW_REG_RAUHTD_TYPE_IPV6)
820 		return true;
821 
822 	num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl,
823 								last_rec_index);
824 	if (++num_entries == MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC)
825 		return true;
826 	return false;
827 }
828 
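/* Periodically dump the active neighbour entries via RAUHTD and feed
 * the activity back to the kernel, so that neighbours in use by the
 * hardware are not aged out. The dump is repeated while the previous
 * response was completely full, since more records may be pending.
 */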
829 static int mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp)
830 {
831 	char *rauhtd_pl;
832 	u8 num_rec;
833 	int i, err;
834 
835 	rauhtd_pl = kmalloc(MLXSW_REG_RAUHTD_LEN, GFP_KERNEL);
836 	if (!rauhtd_pl)
837 		return -ENOMEM;
838 
839 	/* Make sure the neighbour's netdev isn't removed in the
840 	 * process.
841 	 */
842 	rtnl_lock();
843 	do {
844 		mlxsw_reg_rauhtd_pack(rauhtd_pl, MLXSW_REG_RAUHTD_TYPE_IPV4);
845 		err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(rauhtd),
846 				      rauhtd_pl);
847 		if (err) {
848 			dev_err_ratelimited(mlxsw_sp->bus_info->dev, "Failed to dump neighbour table\n");
849 			break;
850 		}
851 		num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl);
852 		for (i = 0; i < num_rec; i++)
853 			mlxsw_sp_router_neigh_rec_process(mlxsw_sp, rauhtd_pl,
854 							  i);
855 	} while (mlxsw_sp_router_rauhtd_is_full(rauhtd_pl));
856 	rtnl_unlock();
857 
858 	kfree(rauhtd_pl);
859 	return err;
860 }
861 
862 static void mlxsw_sp_router_neighs_update_nh(struct mlxsw_sp *mlxsw_sp)
863 {
864 	struct mlxsw_sp_neigh_entry *neigh_entry;
865 
866 	/* Take the RTNL mutex here to prevent the lists from changing */
867 	rtnl_lock();
868 	list_for_each_entry(neigh_entry, &mlxsw_sp->router.nexthop_neighs_list,
869 			    nexthop_neighs_list_node) {
870 		/* If this neigh has nexthops, make the kernel think it is
871 		 * active regardless of the traffic.
872 		 */
873 		if (!list_empty(&neigh_entry->nexthop_list))
874 			neigh_event_send(neigh_entry->key.n, NULL);
875 	}
876 	rtnl_unlock();
877 }
878 
879 static void
880 mlxsw_sp_router_neighs_update_work_schedule(struct mlxsw_sp *mlxsw_sp)
881 {
882 	unsigned long interval = mlxsw_sp->router.neighs_update.interval;
883 
884 	mlxsw_core_schedule_dw(&mlxsw_sp->router.neighs_update.dw,
885 			       msecs_to_jiffies(interval));
886 }
887 
888 static void mlxsw_sp_router_neighs_update_work(struct work_struct *work)
889 {
890 	struct mlxsw_sp *mlxsw_sp = container_of(work, struct mlxsw_sp,
891 						 router.neighs_update.dw.work);
892 	int err;
893 
894 	err = mlxsw_sp_router_neighs_update_rauhtd(mlxsw_sp);
895 	if (err)
896 		dev_err(mlxsw_sp->bus_info->dev, "Could not update kernel for neigh activity\n");
897 
898 	mlxsw_sp_router_neighs_update_nh(mlxsw_sp);
899 
900 	mlxsw_sp_router_neighs_update_work_schedule(mlxsw_sp);
901 }
902 
903 static void mlxsw_sp_router_probe_unresolved_nexthops(struct work_struct *work)
904 {
905 	struct mlxsw_sp_neigh_entry *neigh_entry;
906 	struct mlxsw_sp *mlxsw_sp = container_of(work, struct mlxsw_sp,
907 						 router.nexthop_probe_dw.work);
908 
909 	/* Iterate over nexthop neighbours, find those that are unresolved
910 	 * and send ARP requests for them. This solves the chicken-and-egg
911 	 * problem where a nexthop would not get offloaded until its
912 	 * neighbour is resolved, but the neighbour would never get resolved
913 	 * as long as traffic flows in HW using a different nexthop.
914 	 *
915 	 * Take the RTNL mutex here to prevent the lists from changing.
916 	 */
917 	rtnl_lock();
918 	list_for_each_entry(neigh_entry, &mlxsw_sp->router.nexthop_neighs_list,
919 			    nexthop_neighs_list_node) {
920 		if (!(neigh_entry->key.n->nud_state & NUD_VALID) &&
921 		    !list_empty(&neigh_entry->nexthop_list))
922 			neigh_event_send(neigh_entry->key.n, NULL);
923 	}
924 	rtnl_unlock();
925 
926 	mlxsw_core_schedule_dw(&mlxsw_sp->router.nexthop_probe_dw,
927 			       MLXSW_SP_UNRESOLVED_NH_PROBE_INTERVAL);
928 }
929 
930 static void
931 mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp,
932 			      struct mlxsw_sp_neigh_entry *neigh_entry,
933 			      bool removing);
934 
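/* Deferred work that synchronizes a single neighbour to the device:
 * depending on the entry's previous offload state and the current NUD
 * state, the neighbour is added to, updated in or removed from the
 * hardware via RAUHT, and the nexthops using it are updated
 * accordingly.
 */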
935 static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work)
936 {
937 	struct mlxsw_sp_neigh_entry *neigh_entry =
938 		container_of(work, struct mlxsw_sp_neigh_entry, dw.work);
939 	struct neighbour *n = neigh_entry->key.n;
940 	struct mlxsw_sp_port *mlxsw_sp_port = neigh_entry->mlxsw_sp_port;
941 	struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
942 	char rauht_pl[MLXSW_REG_RAUHT_LEN];
943 	struct net_device *dev;
944 	bool entry_connected;
945 	u8 nud_state;
946 	bool updating;
947 	bool removing;
948 	bool adding;
949 	u32 dip;
950 	int err;
951 
952 	read_lock_bh(&n->lock);
953 	dip = ntohl(*((__be32 *) n->primary_key));
954 	memcpy(neigh_entry->ha, n->ha, sizeof(neigh_entry->ha));
955 	nud_state = n->nud_state;
956 	dev = n->dev;
957 	read_unlock_bh(&n->lock);
958 
959 	entry_connected = nud_state & NUD_VALID;
960 	adding = (!neigh_entry->offloaded) && entry_connected;
961 	updating = neigh_entry->offloaded && entry_connected;
962 	removing = neigh_entry->offloaded && !entry_connected;
963 
964 	if (adding || updating) {
965 		mlxsw_reg_rauht_pack4(rauht_pl, MLXSW_REG_RAUHT_OP_WRITE_ADD,
966 				      neigh_entry->rif,
967 				      neigh_entry->ha, dip);
968 		err = mlxsw_reg_write(mlxsw_sp->core,
969 				      MLXSW_REG(rauht), rauht_pl);
970 		if (err) {
971 			netdev_err(dev, "Could not add neigh %pI4h\n", &dip);
972 			neigh_entry->offloaded = false;
973 		} else {
974 			neigh_entry->offloaded = true;
975 		}
976 		mlxsw_sp_nexthop_neigh_update(mlxsw_sp, neigh_entry, false);
977 	} else if (removing) {
978 		mlxsw_reg_rauht_pack4(rauht_pl, MLXSW_REG_RAUHT_OP_WRITE_DELETE,
979 				      neigh_entry->rif,
980 				      neigh_entry->ha, dip);
981 		err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rauht),
982 				      rauht_pl);
983 		if (err) {
984 			netdev_err(dev, "Could not delete neigh %pI4h\n", &dip);
985 			neigh_entry->offloaded = true;
986 		} else {
987 			neigh_entry->offloaded = false;
988 		}
989 		mlxsw_sp_nexthop_neigh_update(mlxsw_sp, neigh_entry, true);
990 	}
991 
992 	neigh_release(n);
993 	mlxsw_sp_port_dev_put(mlxsw_sp_port);
994 }
995 
996 int mlxsw_sp_router_netevent_event(struct notifier_block *unused,
997 				   unsigned long event, void *ptr)
998 {
999 	struct mlxsw_sp_neigh_entry *neigh_entry;
1000 	struct mlxsw_sp_port *mlxsw_sp_port;
1001 	struct mlxsw_sp *mlxsw_sp;
1002 	unsigned long interval;
1003 	struct net_device *dev;
1004 	struct neigh_parms *p;
1005 	struct neighbour *n;
1006 	u32 dip;
1007 
1008 	switch (event) {
1009 	case NETEVENT_DELAY_PROBE_TIME_UPDATE:
1010 		p = ptr;
1011 
1012 		/* We don't care about changes in the default table. */
1013 		if (!p->dev || p->tbl != &arp_tbl)
1014 			return NOTIFY_DONE;
1015 
1016 		/* We are in atomic context and can't take RTNL mutex,
1017 		 * so use RCU variant to walk the device chain.
1018 		 */
1019 		mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(p->dev);
1020 		if (!mlxsw_sp_port)
1021 			return NOTIFY_DONE;
1022 
1023 		mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
1024 		interval = jiffies_to_msecs(NEIGH_VAR(p, DELAY_PROBE_TIME));
1025 		mlxsw_sp->router.neighs_update.interval = interval;
1026 
1027 		mlxsw_sp_port_dev_put(mlxsw_sp_port);
1028 		break;
1029 	case NETEVENT_NEIGH_UPDATE:
1030 		n = ptr;
1031 		dev = n->dev;
1032 
1033 		if (n->tbl != &arp_tbl)
1034 			return NOTIFY_DONE;
1035 
1036 		mlxsw_sp_port = mlxsw_sp_port_lower_dev_hold(dev);
1037 		if (!mlxsw_sp_port)
1038 			return NOTIFY_DONE;
1039 
1040 		mlxsw_sp = mlxsw_sp_port->mlxsw_sp;
1041 		dip = ntohl(*((__be32 *) n->primary_key));
1042 		neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
1043 		if (WARN_ON(!neigh_entry)) {
1044 			mlxsw_sp_port_dev_put(mlxsw_sp_port);
1045 			return NOTIFY_DONE;
1046 		}
1047 		neigh_entry->mlxsw_sp_port = mlxsw_sp_port;
1048 
1049 		/* Take a reference to ensure the neighbour won't be
1050 		 * destructed until we drop the reference in delayed
1051 		 * work.
1052 		 */
1053 		neigh_clone(n);
1054 		if (!mlxsw_core_schedule_dw(&neigh_entry->dw, 0)) {
1055 			neigh_release(n);
1056 			mlxsw_sp_port_dev_put(mlxsw_sp_port);
1057 		}
1058 		break;
1059 	}
1060 
1061 	return NOTIFY_DONE;
1062 }
1063 
1064 static int mlxsw_sp_neigh_init(struct mlxsw_sp *mlxsw_sp)
1065 {
1066 	int err;
1067 
1068 	err = rhashtable_init(&mlxsw_sp->router.neigh_ht,
1069 			      &mlxsw_sp_neigh_ht_params);
1070 	if (err)
1071 		return err;
1072 
1073 	/* Initialize the polling interval according to the default
1074 	 * table.
1075 	 */
1076 	mlxsw_sp_router_neighs_update_interval_init(mlxsw_sp);
1077 
1078 	/* Create the delayed works for the activity update */
1079 	INIT_DELAYED_WORK(&mlxsw_sp->router.neighs_update.dw,
1080 			  mlxsw_sp_router_neighs_update_work);
1081 	INIT_DELAYED_WORK(&mlxsw_sp->router.nexthop_probe_dw,
1082 			  mlxsw_sp_router_probe_unresolved_nexthops);
1083 	mlxsw_core_schedule_dw(&mlxsw_sp->router.neighs_update.dw, 0);
1084 	mlxsw_core_schedule_dw(&mlxsw_sp->router.nexthop_probe_dw, 0);
1085 	return 0;
1086 }
1087 
1088 static void mlxsw_sp_neigh_fini(struct mlxsw_sp *mlxsw_sp)
1089 {
1090 	cancel_delayed_work_sync(&mlxsw_sp->router.neighs_update.dw);
1091 	cancel_delayed_work_sync(&mlxsw_sp->router.nexthop_probe_dw);
1092 	rhashtable_destroy(&mlxsw_sp->router.neigh_ht);
1093 }
1094 
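/* Nexthop offloading: nexthops that belong to the same route are
 * grouped, and each group owns a contiguous block of adjacency
 * entries in the KVD linear area. Only nexthops whose neighbour is
 * resolved are actually written to that block.
 */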
1095 struct mlxsw_sp_nexthop {
1096 	struct list_head neigh_list_node; /* member of neigh entry list */
1097 	struct mlxsw_sp_nexthop_group *nh_grp; /* pointer back to the group
1098 						* this belongs to
1099 						*/
1100 	u8 should_offload:1, /* set indicates this neigh is connected and
1101 			      * should be put to KVD linear area of this group.
1102 			      */
1103 	   offloaded:1, /* set in case the neigh is actually put into
1104 			 * KVD linear area of this group.
1105 			 */
1106 	   update:1; /* set indicates that MAC of this neigh should be
1107 		      * updated in HW
1108 		      */
1109 	struct mlxsw_sp_neigh_entry *neigh_entry;
1110 };
1111 
1112 struct mlxsw_sp_nexthop_group {
1113 	struct list_head list; /* node in mlxsw->router.nexthop_group_list */
1114 	struct list_head fib_list; /* list of fib entries that use this group */
1115 	u8 adj_index_valid:1;
1116 	u32 adj_index;
1117 	u16 ecmp_size;
1118 	u16 count;
1119 	struct mlxsw_sp_nexthop nexthops[0];
1120 };
1121 
1122 static int mlxsw_sp_adj_index_mass_update_vr(struct mlxsw_sp *mlxsw_sp,
1123 					     struct mlxsw_sp_vr *vr,
1124 					     u32 adj_index, u16 ecmp_size,
1125 					     u32 new_adj_index,
1126 					     u16 new_ecmp_size)
1127 {
1128 	char raleu_pl[MLXSW_REG_RALEU_LEN];
1129 
1130 	mlxsw_reg_raleu_pack(raleu_pl,
1131 			     (enum mlxsw_reg_ralxx_protocol) vr->proto, vr->id,
1132 			     adj_index, ecmp_size, new_adj_index,
1133 			     new_ecmp_size);
1134 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raleu), raleu_pl);
1135 }
1136 
1137 static int mlxsw_sp_adj_index_mass_update(struct mlxsw_sp *mlxsw_sp,
1138 					  struct mlxsw_sp_nexthop_group *nh_grp,
1139 					  u32 old_adj_index, u16 old_ecmp_size)
1140 {
1141 	struct mlxsw_sp_fib_entry *fib_entry;
1142 	struct mlxsw_sp_vr *vr = NULL;
1143 	int err;
1144 
1145 	list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) {
1146 		if (vr == fib_entry->vr)
1147 			continue;
1148 		vr = fib_entry->vr;
1149 		err = mlxsw_sp_adj_index_mass_update_vr(mlxsw_sp, vr,
1150 							old_adj_index,
1151 							old_ecmp_size,
1152 							nh_grp->adj_index,
1153 							nh_grp->ecmp_size);
1154 		if (err)
1155 			return err;
1156 	}
1157 	return 0;
1158 }
1159 
1160 static int mlxsw_sp_nexthop_mac_update(struct mlxsw_sp *mlxsw_sp, u32 adj_index,
1161 				       struct mlxsw_sp_nexthop *nh)
1162 {
1163 	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
1164 	char ratr_pl[MLXSW_REG_RATR_LEN];
1165 
1166 	mlxsw_reg_ratr_pack(ratr_pl, MLXSW_REG_RATR_OP_WRITE_WRITE_ENTRY,
1167 			    true, adj_index, neigh_entry->rif);
1168 	mlxsw_reg_ratr_eth_entry_pack(ratr_pl, neigh_entry->ha);
1169 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ratr), ratr_pl);
1170 }
1171 
1172 static int
1173 mlxsw_sp_nexthop_group_mac_update(struct mlxsw_sp *mlxsw_sp,
1174 				  struct mlxsw_sp_nexthop_group *nh_grp)
1175 {
1176 	u32 adj_index = nh_grp->adj_index; /* base */
1177 	struct mlxsw_sp_nexthop *nh;
1178 	int i;
1179 	int err;
1180 
1181 	for (i = 0; i < nh_grp->count; i++) {
1182 		nh = &nh_grp->nexthops[i];
1183 
1184 		if (!nh->should_offload) {
1185 			nh->offloaded = 0;
1186 			continue;
1187 		}
1188 
1189 		if (nh->update) {
1190 			err = mlxsw_sp_nexthop_mac_update(mlxsw_sp,
1191 							  adj_index, nh);
1192 			if (err)
1193 				return err;
1194 			nh->update = 0;
1195 			nh->offloaded = 1;
1196 		}
1197 		adj_index++;
1198 	}
1199 	return 0;
1200 }
1201 
1202 static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp,
1203 				     struct mlxsw_sp_fib_entry *fib_entry);
1204 
1205 static int
1206 mlxsw_sp_nexthop_fib_entries_update(struct mlxsw_sp *mlxsw_sp,
1207 				    struct mlxsw_sp_nexthop_group *nh_grp)
1208 {
1209 	struct mlxsw_sp_fib_entry *fib_entry;
1210 	int err;
1211 
1212 	list_for_each_entry(fib_entry, &nh_grp->fib_list, nexthop_group_node) {
1213 		err = mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
1214 		if (err)
1215 			return err;
1216 	}
1217 	return 0;
1218 }
1219 
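/* Recompute a nexthop group after a neighbour change: count the
 * offloadable nexthops, allocate a new adjacency block of that size,
 * write the MACs, and repoint all FIB entries using the group from
 * the old block to the new one. If nothing can be offloaded, or any
 * step fails, fall back to trapping the traffic to the CPU.
 */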
1220 static void
1221 mlxsw_sp_nexthop_group_refresh(struct mlxsw_sp *mlxsw_sp,
1222 			       struct mlxsw_sp_nexthop_group *nh_grp)
1223 {
1224 	struct mlxsw_sp_nexthop *nh;
1225 	bool offload_change = false;
1226 	u32 adj_index;
1227 	u16 ecmp_size = 0;
1228 	bool old_adj_index_valid;
1229 	u32 old_adj_index;
1230 	u16 old_ecmp_size;
1231 	int ret;
1232 	int i;
1233 	int err;
1234 
1235 	for (i = 0; i < nh_grp->count; i++) {
1236 		nh = &nh_grp->nexthops[i];
1237 
1238 		if (nh->should_offload ^ nh->offloaded) {
1239 			offload_change = true;
1240 			if (nh->should_offload)
1241 				nh->update = 1;
1242 		}
1243 		if (nh->should_offload)
1244 			ecmp_size++;
1245 	}
1246 	if (!offload_change) {
1247 		/* Nothing was added or removed, so no need to reallocate. Just
1248 		 * update MAC on existing adjacency indexes.
1249 		 */
1250 		err = mlxsw_sp_nexthop_group_mac_update(mlxsw_sp, nh_grp);
1251 		if (err) {
1252 			dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n");
1253 			goto set_trap;
1254 		}
1255 		return;
1256 	}
1257 	if (!ecmp_size)
1258 		/* No neigh of this group is connected, so we just set
1259 		 * the trap and let everything flow through the kernel.
1260 		 */
1261 		goto set_trap;
1262 
1263 	ret = mlxsw_sp_kvdl_alloc(mlxsw_sp, ecmp_size);
1264 	if (ret < 0) {
1265 		/* We ran out of KVD linear space, just set the
1266 		 * trap and let everything flow through the kernel.
1267 		 */
1268 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to allocate KVD linear area for nexthop group.\n");
1269 		goto set_trap;
1270 	}
1271 	adj_index = ret;
1272 	old_adj_index_valid = nh_grp->adj_index_valid;
1273 	old_adj_index = nh_grp->adj_index;
1274 	old_ecmp_size = nh_grp->ecmp_size;
1275 	nh_grp->adj_index_valid = 1;
1276 	nh_grp->adj_index = adj_index;
1277 	nh_grp->ecmp_size = ecmp_size;
1278 	err = mlxsw_sp_nexthop_group_mac_update(mlxsw_sp, nh_grp);
1279 	if (err) {
1280 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to update neigh MAC in adjacency table.\n");
1281 		goto set_trap;
1282 	}
1283 
1284 	if (!old_adj_index_valid) {
1285 		/* The trap was set for fib entries, so we have to call
1286 		 * fib entry update to unset it and use adjacency index.
1287 		 */
1288 		err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp);
1289 		if (err) {
1290 			dev_warn(mlxsw_sp->bus_info->dev, "Failed to add adjacency index to fib entries.\n");
1291 			goto set_trap;
1292 		}
1293 		return;
1294 	}
1295 
1296 	err = mlxsw_sp_adj_index_mass_update(mlxsw_sp, nh_grp,
1297 					     old_adj_index, old_ecmp_size);
1298 	mlxsw_sp_kvdl_free(mlxsw_sp, old_adj_index);
1299 	if (err) {
1300 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to mass-update adjacency index for nexthop group.\n");
1301 		goto set_trap;
1302 	}
1303 	return;
1304 
1305 set_trap:
1306 	old_adj_index_valid = nh_grp->adj_index_valid;
1307 	nh_grp->adj_index_valid = 0;
1308 	for (i = 0; i < nh_grp->count; i++) {
1309 		nh = &nh_grp->nexthops[i];
1310 		nh->offloaded = 0;
1311 	}
1312 	err = mlxsw_sp_nexthop_fib_entries_update(mlxsw_sp, nh_grp);
1313 	if (err)
1314 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to set traps for fib entries.\n");
1315 	if (old_adj_index_valid)
1316 		mlxsw_sp_kvdl_free(mlxsw_sp, nh_grp->adj_index);
1317 }
1318 
1319 static void __mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp_nexthop *nh,
1320 					    bool removing)
1321 {
1322 	if (!removing && !nh->should_offload)
1323 		nh->should_offload = 1;
1324 	else if (removing && nh->offloaded)
1325 		nh->should_offload = 0;
1326 	nh->update = 1;
1327 }
1328 
1329 static void
1330 mlxsw_sp_nexthop_neigh_update(struct mlxsw_sp *mlxsw_sp,
1331 			      struct mlxsw_sp_neigh_entry *neigh_entry,
1332 			      bool removing)
1333 {
1334 	struct mlxsw_sp_nexthop *nh;
1335 
1336 	/* Take the RTNL mutex here to prevent the lists from changing */
1337 	rtnl_lock();
1338 	list_for_each_entry(nh, &neigh_entry->nexthop_list,
1339 			    neigh_list_node) {
1340 		__mlxsw_sp_nexthop_neigh_update(nh, removing);
1341 		mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh->nh_grp);
1342 	}
1343 	rtnl_unlock();
1344 }
1345 
1346 static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp,
1347 				 struct mlxsw_sp_nexthop_group *nh_grp,
1348 				 struct mlxsw_sp_nexthop *nh,
1349 				 struct fib_nh *fib_nh)
1350 {
1351 	struct mlxsw_sp_neigh_entry *neigh_entry;
1352 	struct net_device *dev = fib_nh->nh_dev;
1353 	struct neighbour *n;
1354 	u8 nud_state;
1355 
1356 	/* Take a reference on the neigh here, ensuring that it is
1357 	 * not destructed before the nexthop entry is finished with it.
1358 	 * The reference is taken either in neigh_lookup() or
1359 	 * in neigh_create() in case n is not found.
1360 	 */
1361 	n = neigh_lookup(&arp_tbl, &fib_nh->nh_gw, dev);
1362 	if (!n) {
1363 		n = neigh_create(&arp_tbl, &fib_nh->nh_gw, dev);
1364 		if (IS_ERR(n))
1365 			return PTR_ERR(n);
1366 		neigh_event_send(n, NULL);
1367 	}
1368 	neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n);
1369 	if (!neigh_entry) {
1370 		neigh_release(n);
1371 		return -EINVAL;
1372 	}
1373 
1374 	/* If that is the first nexthop connected to that neigh, add to
1375 	 * nexthop_neighs_list
1376 	 */
1377 	if (list_empty(&neigh_entry->nexthop_list))
1378 		list_add_tail(&neigh_entry->nexthop_neighs_list_node,
1379 			      &mlxsw_sp->router.nexthop_neighs_list);
1380 
1381 	nh->nh_grp = nh_grp;
1382 	nh->neigh_entry = neigh_entry;
1383 	list_add_tail(&nh->neigh_list_node, &neigh_entry->nexthop_list);
1384 	read_lock_bh(&n->lock);
1385 	nud_state = n->nud_state;
1386 	read_unlock_bh(&n->lock);
1387 	__mlxsw_sp_nexthop_neigh_update(nh, !(nud_state & NUD_VALID));
1388 
1389 	return 0;
1390 }
1391 
1392 static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp,
1393 				  struct mlxsw_sp_nexthop *nh)
1394 {
1395 	struct mlxsw_sp_neigh_entry *neigh_entry = nh->neigh_entry;
1396 
1397 	list_del(&nh->neigh_list_node);
1398 
1399 	/* If that is the last nexthop connected to that neigh, remove from
1400 	 * nexthop_neighs_list
1401 	 */
1402 	if (list_empty(&neigh_entry->nexthop_list))
1403 		list_del(&neigh_entry->nexthop_neighs_list_node);
1404 
1405 	neigh_release(neigh_entry->key.n);
1406 }
1407 
1408 static struct mlxsw_sp_nexthop_group *
1409 mlxsw_sp_nexthop_group_create(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi)
1410 {
1411 	struct mlxsw_sp_nexthop_group *nh_grp;
1412 	struct mlxsw_sp_nexthop *nh;
1413 	struct fib_nh *fib_nh;
1414 	size_t alloc_size;
1415 	int i;
1416 	int err;
1417 
1418 	alloc_size = sizeof(*nh_grp) +
1419 		     fi->fib_nhs * sizeof(struct mlxsw_sp_nexthop);
1420 	nh_grp = kzalloc(alloc_size, GFP_KERNEL);
1421 	if (!nh_grp)
1422 		return ERR_PTR(-ENOMEM);
1423 	INIT_LIST_HEAD(&nh_grp->fib_list);
1424 	nh_grp->count = fi->fib_nhs;
1425 	for (i = 0; i < nh_grp->count; i++) {
1426 		nh = &nh_grp->nexthops[i];
1427 		fib_nh = &fi->fib_nh[i];
1428 		err = mlxsw_sp_nexthop_init(mlxsw_sp, nh_grp, nh, fib_nh);
1429 		if (err)
1430 			goto err_nexthop_init;
1431 	}
1432 	list_add_tail(&nh_grp->list, &mlxsw_sp->router.nexthop_group_list);
1433 	mlxsw_sp_nexthop_group_refresh(mlxsw_sp, nh_grp);
1434 	return nh_grp;
1435 
1436 err_nexthop_init:
1437 	for (i--; i >= 0; i--)
1438 		mlxsw_sp_nexthop_fini(mlxsw_sp, &nh_grp->nexthops[i]);
1439 	kfree(nh_grp);
1440 	return ERR_PTR(err);
1441 }
1442 
1443 static void
1444 mlxsw_sp_nexthop_group_destroy(struct mlxsw_sp *mlxsw_sp,
1445 			       struct mlxsw_sp_nexthop_group *nh_grp)
1446 {
1447 	struct mlxsw_sp_nexthop *nh;
1448 	int i;
1449 
1450 	list_del(&nh_grp->list);
1451 	for (i = 0; i < nh_grp->count; i++) {
1452 		nh = &nh_grp->nexthops[i];
1453 		mlxsw_sp_nexthop_fini(mlxsw_sp, nh);
1454 	}
1455 	kfree(nh_grp);
1456 }
1457 
1458 static bool mlxsw_sp_nexthop_match(struct mlxsw_sp_nexthop *nh,
1459 				   struct fib_info *fi)
1460 {
1461 	int i;
1462 
1463 	for (i = 0; i < fi->fib_nhs; i++) {
1464 		struct fib_nh *fib_nh = &fi->fib_nh[i];
1465 		struct neighbour *n = nh->neigh_entry->key.n;
1466 
1467 		if (memcmp(n->primary_key, &fib_nh->nh_gw,
1468 			   sizeof(fib_nh->nh_gw)) == 0 &&
1469 		    n->dev == fib_nh->nh_dev)
1470 			return true;
1471 	}
1472 	return false;
1473 }
1474 
1475 static bool mlxsw_sp_nexthop_group_match(struct mlxsw_sp_nexthop_group *nh_grp,
1476 					 struct fib_info *fi)
1477 {
1478 	int i;
1479 
1480 	if (nh_grp->count != fi->fib_nhs)
1481 		return false;
1482 	for (i = 0; i < nh_grp->count; i++) {
1483 		struct mlxsw_sp_nexthop *nh = &nh_grp->nexthops[i];
1484 
1485 		if (!mlxsw_sp_nexthop_match(nh, fi))
1486 			return false;
1487 	}
1488 	return true;
1489 }
1490 
1491 static struct mlxsw_sp_nexthop_group *
1492 mlxsw_sp_nexthop_group_find(struct mlxsw_sp *mlxsw_sp, struct fib_info *fi)
1493 {
1494 	struct mlxsw_sp_nexthop_group *nh_grp;
1495 
1496 	list_for_each_entry(nh_grp, &mlxsw_sp->router.nexthop_group_list,
1497 			    list) {
1498 		if (mlxsw_sp_nexthop_group_match(nh_grp, fi))
1499 			return nh_grp;
1500 	}
1501 	return NULL;
1502 }
1503 
1504 static int mlxsw_sp_nexthop_group_get(struct mlxsw_sp *mlxsw_sp,
1505 				      struct mlxsw_sp_fib_entry *fib_entry,
1506 				      struct fib_info *fi)
1507 {
1508 	struct mlxsw_sp_nexthop_group *nh_grp;
1509 
1510 	nh_grp = mlxsw_sp_nexthop_group_find(mlxsw_sp, fi);
1511 	if (!nh_grp) {
1512 		nh_grp = mlxsw_sp_nexthop_group_create(mlxsw_sp, fi);
1513 		if (IS_ERR(nh_grp))
1514 			return PTR_ERR(nh_grp);
1515 	}
1516 	list_add_tail(&fib_entry->nexthop_group_node, &nh_grp->fib_list);
1517 	fib_entry->nh_group = nh_grp;
1518 	return 0;
1519 }
1520 
1521 static void mlxsw_sp_nexthop_group_put(struct mlxsw_sp *mlxsw_sp,
1522 				       struct mlxsw_sp_fib_entry *fib_entry)
1523 {
1524 	struct mlxsw_sp_nexthop_group *nh_grp = fib_entry->nh_group;
1525 
1526 	list_del(&fib_entry->nexthop_group_node);
1527 	if (!list_empty(&nh_grp->fib_list))
1528 		return;
1529 	mlxsw_sp_nexthop_group_destroy(mlxsw_sp, nh_grp);
1530 }
1531 
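/* The RALUE register programs one LPM entry. The three helpers below
 * encode the three entry types: remote (forward via the group's
 * adjacency block, or trap while it is invalid), local (forward to a
 * RIF) and trap (punt to the CPU via the ip2me action).
 */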
1532 static int mlxsw_sp_fib_entry_op4_remote(struct mlxsw_sp *mlxsw_sp,
1533 					 struct mlxsw_sp_fib_entry *fib_entry,
1534 					 enum mlxsw_reg_ralue_op op)
1535 {
1536 	char ralue_pl[MLXSW_REG_RALUE_LEN];
1537 	u32 *p_dip = (u32 *) fib_entry->key.addr;
1538 	struct mlxsw_sp_vr *vr = fib_entry->vr;
1539 	enum mlxsw_reg_ralue_trap_action trap_action;
1540 	u16 trap_id = 0;
1541 	u32 adjacency_index = 0;
1542 	u16 ecmp_size = 0;
1543 
1544 	/* In case the nexthop group adjacency index is valid, use it
1545 	 * with the provided ECMP size. Otherwise, set up a trap and pass
1546 	 * traffic to the kernel.
1547 	 */
1548 	if (fib_entry->nh_group->adj_index_valid) {
1549 		trap_action = MLXSW_REG_RALUE_TRAP_ACTION_NOP;
1550 		adjacency_index = fib_entry->nh_group->adj_index;
1551 		ecmp_size = fib_entry->nh_group->ecmp_size;
1552 	} else {
1553 		trap_action = MLXSW_REG_RALUE_TRAP_ACTION_TRAP;
1554 		trap_id = MLXSW_TRAP_ID_RTR_INGRESS0;
1555 	}
1556 
1557 	mlxsw_reg_ralue_pack4(ralue_pl,
1558 			      (enum mlxsw_reg_ralxx_protocol) vr->proto, op,
1559 			      vr->id, fib_entry->key.prefix_len, *p_dip);
1560 	mlxsw_reg_ralue_act_remote_pack(ralue_pl, trap_action, trap_id,
1561 					adjacency_index, ecmp_size);
1562 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
1563 }
1564 
1565 static int mlxsw_sp_fib_entry_op4_local(struct mlxsw_sp *mlxsw_sp,
1566 					struct mlxsw_sp_fib_entry *fib_entry,
1567 					enum mlxsw_reg_ralue_op op)
1568 {
1569 	char ralue_pl[MLXSW_REG_RALUE_LEN];
1570 	u32 *p_dip = (u32 *) fib_entry->key.addr;
1571 	struct mlxsw_sp_vr *vr = fib_entry->vr;
1572 
1573 	mlxsw_reg_ralue_pack4(ralue_pl,
1574 			      (enum mlxsw_reg_ralxx_protocol) vr->proto, op,
1575 			      vr->id, fib_entry->key.prefix_len, *p_dip);
1576 	mlxsw_reg_ralue_act_local_pack(ralue_pl,
1577 				       MLXSW_REG_RALUE_TRAP_ACTION_NOP, 0,
1578 				       fib_entry->rif);
1579 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
1580 }
1581 
1582 static int mlxsw_sp_fib_entry_op4_trap(struct mlxsw_sp *mlxsw_sp,
1583 				       struct mlxsw_sp_fib_entry *fib_entry,
1584 				       enum mlxsw_reg_ralue_op op)
1585 {
1586 	char ralue_pl[MLXSW_REG_RALUE_LEN];
1587 	u32 *p_dip = (u32 *) fib_entry->key.addr;
1588 	struct mlxsw_sp_vr *vr = fib_entry->vr;
1589 
1590 	mlxsw_reg_ralue_pack4(ralue_pl,
1591 			      (enum mlxsw_reg_ralxx_protocol) vr->proto, op,
1592 			      vr->id, fib_entry->key.prefix_len, *p_dip);
1593 	mlxsw_reg_ralue_act_ip2me_pack(ralue_pl);
1594 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
1595 }
1596 
1597 static int mlxsw_sp_fib_entry_op4(struct mlxsw_sp *mlxsw_sp,
1598 				  struct mlxsw_sp_fib_entry *fib_entry,
1599 				  enum mlxsw_reg_ralue_op op)
1600 {
1601 	switch (fib_entry->type) {
1602 	case MLXSW_SP_FIB_ENTRY_TYPE_REMOTE:
1603 		return mlxsw_sp_fib_entry_op4_remote(mlxsw_sp, fib_entry, op);
1604 	case MLXSW_SP_FIB_ENTRY_TYPE_LOCAL:
1605 		return mlxsw_sp_fib_entry_op4_local(mlxsw_sp, fib_entry, op);
1606 	case MLXSW_SP_FIB_ENTRY_TYPE_TRAP:
1607 		return mlxsw_sp_fib_entry_op4_trap(mlxsw_sp, fib_entry, op);
1608 	}
1609 	return -EINVAL;
1610 }
1611 
1612 static int mlxsw_sp_fib_entry_op(struct mlxsw_sp *mlxsw_sp,
1613 				 struct mlxsw_sp_fib_entry *fib_entry,
1614 				 enum mlxsw_reg_ralue_op op)
1615 {
1616 	switch (fib_entry->vr->proto) {
1617 	case MLXSW_SP_L3_PROTO_IPV4:
1618 		return mlxsw_sp_fib_entry_op4(mlxsw_sp, fib_entry, op);
1619 	case MLXSW_SP_L3_PROTO_IPV6:
1620 		return -EINVAL;
1621 	}
1622 	return -EINVAL;
1623 }
1624 
1625 static int mlxsw_sp_fib_entry_update(struct mlxsw_sp *mlxsw_sp,
1626 				     struct mlxsw_sp_fib_entry *fib_entry)
1627 {
1628 	return mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry,
1629 				     MLXSW_REG_RALUE_OP_WRITE_WRITE);
1630 }
1631 
1632 static int mlxsw_sp_fib_entry_del(struct mlxsw_sp *mlxsw_sp,
1633 				  struct mlxsw_sp_fib_entry *fib_entry)
1634 {
1635 	return mlxsw_sp_fib_entry_op(mlxsw_sp, fib_entry,
1636 				     MLXSW_REG_RALUE_OP_WRITE_DELETE);
1637 }
1638 
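/* Classify a FIB4 entry: local/broadcast routes and routes via
 * devices unrelated to us become traps, routes with a non-universe
 * scope (e.g. directly connected) become local entries pointing at
 * the RIF, and the rest become remote entries backed by a nexthop
 * group.
 */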
1639 static int
1640 mlxsw_sp_router_fib4_entry_init(struct mlxsw_sp *mlxsw_sp,
1641 				const struct fib_entry_notifier_info *fen_info,
1642 				struct mlxsw_sp_fib_entry *fib_entry)
1643 {
1644 	struct fib_info *fi = fen_info->fi;
1645 	struct mlxsw_sp_rif *r = NULL;
1646 	int nhsel;
1647 	int err;
1648 
1649 	if (fen_info->type == RTN_LOCAL || fen_info->type == RTN_BROADCAST) {
1650 		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
1651 		return 0;
1652 	}
1653 	if (fen_info->type != RTN_UNICAST)
1654 		return -EINVAL;
1655 
1656 	for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
1657 		const struct fib_nh *nh = &fi->fib_nh[nhsel];
1658 
1659 		if (!nh->nh_dev)
1660 			continue;
1661 		r = mlxsw_sp_rif_find_by_dev(mlxsw_sp, nh->nh_dev);
1662 		if (!r) {
1663 			/* In case a router interface is not found for
1664 			 * at least one of the nexthops, that nexthop
1665 			 * points to some device unrelated to us. Set a
1666 			 * trap and pass the packets for this prefix to
1667 			 * the kernel.
1668 			 */
1669 			break;
1670 		}
1671 	}
1672 
1673 	if (!r) {
1674 		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_TRAP;
1675 		return 0;
1676 	}
1677 
1678 	if (fi->fib_scope != RT_SCOPE_UNIVERSE) {
1679 		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_LOCAL;
1680 		fib_entry->rif = r->rif;
1681 	} else {
1682 		fib_entry->type = MLXSW_SP_FIB_ENTRY_TYPE_REMOTE;
1683 		err = mlxsw_sp_nexthop_group_get(mlxsw_sp, fib_entry, fi);
1684 		if (err)
1685 			return err;
1686 	}
1687 	fib_info_offload_inc(fen_info->fi);
1688 	return 0;
1689 }
1690 
1691 static void
1692 mlxsw_sp_router_fib4_entry_fini(struct mlxsw_sp *mlxsw_sp,
1693 				struct mlxsw_sp_fib_entry *fib_entry)
1694 {
1695 	if (fib_entry->type != MLXSW_SP_FIB_ENTRY_TYPE_TRAP)
1696 		fib_info_offload_dec(fib_entry->fi);
1697 	if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_REMOTE)
1698 		mlxsw_sp_nexthop_group_put(mlxsw_sp, fib_entry);
1699 }
1700 
1701 static struct mlxsw_sp_fib_entry *
1702 mlxsw_sp_fib_entry_get(struct mlxsw_sp *mlxsw_sp,
1703 		       const struct fib_entry_notifier_info *fen_info)
1704 {
1705 	struct mlxsw_sp_fib_entry *fib_entry;
1706 	struct fib_info *fi = fen_info->fi;
1707 	struct mlxsw_sp_vr *vr;
1708 	int err;
1709 
1710 	vr = mlxsw_sp_vr_get(mlxsw_sp, fen_info->dst_len, fen_info->tb_id,
1711 			     MLXSW_SP_L3_PROTO_IPV4);
1712 	if (IS_ERR(vr))
1713 		return ERR_CAST(vr);
1714 
1715 	fib_entry = mlxsw_sp_fib_entry_lookup(vr->fib, &fen_info->dst,
1716 					      sizeof(fen_info->dst),
1717 					      fen_info->dst_len, fi->fib_dev);
1718 	if (fib_entry) {
1719 		/* Already exists, just take a reference */
1720 		fib_entry->ref_count++;
1721 		return fib_entry;
1722 	}
1723 	fib_entry = mlxsw_sp_fib_entry_create(vr->fib, &fen_info->dst,
1724 					      sizeof(fen_info->dst),
1725 					      fen_info->dst_len, fi->fib_dev);
1726 	if (!fib_entry) {
1727 		err = -ENOMEM;
1728 		goto err_fib_entry_create;
1729 	}
1730 	fib_entry->vr = vr;
1731 	fib_entry->fi = fi;
1732 	fib_entry->ref_count = 1;
1733 
1734 	err = mlxsw_sp_router_fib4_entry_init(mlxsw_sp, fen_info, fib_entry);
1735 	if (err)
1736 		goto err_fib4_entry_init;
1737 
1738 	return fib_entry;
1739 
1740 err_fib4_entry_init:
1741 	mlxsw_sp_fib_entry_destroy(fib_entry);
1742 err_fib_entry_create:
1743 	mlxsw_sp_vr_put(mlxsw_sp, vr);
1744 
1745 	return ERR_PTR(err);
1746 }
1747 
1748 static struct mlxsw_sp_fib_entry *
1749 mlxsw_sp_fib_entry_find(struct mlxsw_sp *mlxsw_sp,
1750 			const struct fib_entry_notifier_info *fen_info)
1751 {
1752 	struct mlxsw_sp_vr *vr;
1753 
1754 	vr = mlxsw_sp_vr_find(mlxsw_sp, fen_info->tb_id,
1755 			      MLXSW_SP_L3_PROTO_IPV4);
1756 	if (!vr)
1757 		return NULL;
1758 
1759 	return mlxsw_sp_fib_entry_lookup(vr->fib, &fen_info->dst,
1760 					 sizeof(fen_info->dst),
1761 					 fen_info->dst_len,
1762 					 fen_info->fi->fib_dev);
1763 }
1764 
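/* Drop one reference on @fib_entry, destroying it on the last one, and
 * release the virtual router reference taken by
 * mlxsw_sp_fib_entry_get().
 */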
1765 static void mlxsw_sp_fib_entry_put(struct mlxsw_sp *mlxsw_sp,
1766 				   struct mlxsw_sp_fib_entry *fib_entry)
1767 {
1768 	struct mlxsw_sp_vr *vr = fib_entry->vr;
1769 
1770 	if (--fib_entry->ref_count == 0) {
1771 		mlxsw_sp_router_fib4_entry_fini(mlxsw_sp, fib_entry);
1772 		mlxsw_sp_fib_entry_destroy(fib_entry);
1773 	}
1774 	mlxsw_sp_vr_put(mlxsw_sp, vr);
1775 }
1776 
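/* Drop all remaining references on @fib_entry. Used by the flush path,
 * where the entry must go away regardless of how many routes still
 * reference it.
 */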
1777 static void mlxsw_sp_fib_entry_put_all(struct mlxsw_sp *mlxsw_sp,
1778 				       struct mlxsw_sp_fib_entry *fib_entry)
1779 {
1780 	unsigned int last_ref_count;
1781 
1782 	do {
1783 		last_ref_count = fib_entry->ref_count;
1784 		mlxsw_sp_fib_entry_put(mlxsw_sp, fib_entry);
1785 	} while (last_ref_count != 1);
1786 }
1787 
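/* Reflect a FIB_EVENT_ENTRY_ADD notification in the device: take (or
 * create) the FIB entry and, if this is its first reference, insert it
 * into the driver's FIB and program it into the hardware table.
 */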
1788 static int mlxsw_sp_router_fib4_add(struct mlxsw_sp *mlxsw_sp,
1789 				    struct fib_entry_notifier_info *fen_info)
1790 {
1791 	struct mlxsw_sp_fib_entry *fib_entry;
1792 	struct mlxsw_sp_vr *vr;
1793 	int err;
1794 
1795 	if (mlxsw_sp->router.aborted)
1796 		return 0;
1797 
1798 	fib_entry = mlxsw_sp_fib_entry_get(mlxsw_sp, fen_info);
1799 	if (IS_ERR(fib_entry)) {
1800 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to get FIB4 entry being added.\n");
1801 		return PTR_ERR(fib_entry);
1802 	}
1803 
1804 	if (fib_entry->ref_count != 1)
1805 		return 0;
1806 
1807 	vr = fib_entry->vr;
1808 	err = mlxsw_sp_fib_entry_insert(vr->fib, fib_entry);
1809 	if (err) {
1810 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to insert FIB4 entry being added.\n");
1811 		goto err_fib_entry_insert;
1812 	}
1813 	err = mlxsw_sp_fib_entry_update(mlxsw_sp, fib_entry);
1814 	if (err)
1815 		goto err_fib_entry_add;
1816 	return 0;
1817 
1818 err_fib_entry_add:
1819 	mlxsw_sp_fib_entry_remove(vr->fib, fib_entry);
1820 err_fib_entry_insert:
1821 	mlxsw_sp_fib_entry_put(mlxsw_sp, fib_entry);
1822 	return err;
1823 }
1824 
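/* Reflect a FIB_EVENT_ENTRY_DEL notification: on the last reference
 * the entry is removed from the hardware table and from the driver's
 * FIB before being destroyed.
 */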
1825 static void mlxsw_sp_router_fib4_del(struct mlxsw_sp *mlxsw_sp,
1826 				     struct fib_entry_notifier_info *fen_info)
1827 {
1828 	struct mlxsw_sp_fib_entry *fib_entry;
1829 
1830 	if (mlxsw_sp->router.aborted)
1831 		return;
1832 
1833 	fib_entry = mlxsw_sp_fib_entry_find(mlxsw_sp, fen_info);
1834 	if (!fib_entry)
1835 		return;
1836 
1837 	if (fib_entry->ref_count == 1) {
1838 		mlxsw_sp_fib_entry_del(mlxsw_sp, fib_entry);
1839 		mlxsw_sp_fib_entry_remove(fib_entry->vr->fib, fib_entry);
1840 	}
1841 
1842 	mlxsw_sp_fib_entry_put(mlxsw_sp, fib_entry);
1843 }
1844 
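/* Install a catch-all trap route after an abort: create the minimal
 * LPM tree, bind virtual router 0 to it and program a default (/0)
 * route whose action is ip2me, so routed packets are handed to the
 * kernel and forwarded in software.
 */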
1845 static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp)
1846 {
1847 	char ralta_pl[MLXSW_REG_RALTA_LEN];
1848 	char ralst_pl[MLXSW_REG_RALST_LEN];
1849 	char raltb_pl[MLXSW_REG_RALTB_LEN];
1850 	char ralue_pl[MLXSW_REG_RALUE_LEN];
1851 	int err;
1852 
1853 	mlxsw_reg_ralta_pack(ralta_pl, true, MLXSW_REG_RALXX_PROTOCOL_IPV4,
1854 			     MLXSW_SP_LPM_TREE_MIN);
1855 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralta), ralta_pl);
1856 	if (err)
1857 		return err;
1858 
1859 	mlxsw_reg_ralst_pack(ralst_pl, 0xff, MLXSW_SP_LPM_TREE_MIN);
1860 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralst), ralst_pl);
1861 	if (err)
1862 		return err;
1863 
1864 	mlxsw_reg_raltb_pack(raltb_pl, 0, MLXSW_REG_RALXX_PROTOCOL_IPV4,
1865 			     MLXSW_SP_LPM_TREE_MIN);
1866 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(raltb), raltb_pl);
1867 	if (err)
1868 		return err;
1869 
1870 	mlxsw_reg_ralue_pack4(ralue_pl, MLXSW_SP_L3_PROTO_IPV4,
1871 			      MLXSW_REG_RALUE_OP_WRITE_WRITE, 0, 0, 0);
1872 	mlxsw_reg_ralue_act_ip2me_pack(ralue_pl);
1873 	return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl);
1874 }
1875 
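/* Remove every FIB entry from the device across all in-use virtual
 * routers. The do_break test below presumably guards against touching
 * the list head after the final put, which may free the virtual
 * router's FIB along with it.
 */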
1876 static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp)
1877 {
1878 	struct mlxsw_sp_fib_entry *fib_entry;
1879 	struct mlxsw_sp_fib_entry *tmp;
1880 	struct mlxsw_sp_vr *vr;
1881 	int i;
1882 
1883 	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_VRS); i++) {
1884 		vr = &mlxsw_sp->router.vrs[i];
1885 
1886 		if (!vr->used)
1887 			continue;
1888 
1889 		list_for_each_entry_safe(fib_entry, tmp,
1890 					 &vr->fib->entry_list, list) {
1891 			bool do_break = &tmp->list == &vr->fib->entry_list;
1892 
1893 			mlxsw_sp_fib_entry_del(mlxsw_sp, fib_entry);
1894 			mlxsw_sp_fib_entry_remove(fib_entry->vr->fib,
1895 						  fib_entry);
1896 			mlxsw_sp_fib_entry_put_all(mlxsw_sp, fib_entry);
1897 			if (do_break)
1898 				break;
1899 		}
1900 	}
1901 }
1902 
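/* Give up on FIB offload: flush the device's tables, mark the router
 * as aborted so subsequent FIB events are ignored, and fall back to
 * trapping all routed packets to the kernel.
 */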
1903 static void mlxsw_sp_router_fib4_abort(struct mlxsw_sp *mlxsw_sp)
1904 {
1905 	int err;
1906 
1907 	if (mlxsw_sp->router.aborted)
1908 		return;
1909 	dev_warn(mlxsw_sp->bus_info->dev, "FIB abort triggered. Note that FIB entries are no longer being offloaded to this device.\n");
1910 	mlxsw_sp_router_fib_flush(mlxsw_sp);
1911 	mlxsw_sp->router.aborted = true;
1912 	err = mlxsw_sp_router_set_abort_trap(mlxsw_sp);
1913 	if (err)
1914 		dev_warn(mlxsw_sp->bus_info->dev, "Failed to set abort trap.\n");
1915 }
1916 
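/* Low-level router init: allocate the RIF array according to the
 * device's reported MAX_RIFS resource and enable the router via RGCR.
 */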
1917 static int __mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
1918 {
1919 	char rgcr_pl[MLXSW_REG_RGCR_LEN];
1920 	u64 max_rifs;
1921 	int err;
1922 
1923 	if (!MLXSW_CORE_RES_VALID(mlxsw_sp->core, MAX_RIFS))
1924 		return -EIO;
1925 
1926 	max_rifs = MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS);
1927 	mlxsw_sp->rifs = kcalloc(max_rifs, sizeof(struct mlxsw_sp_rif *),
1928 				 GFP_KERNEL);
1929 	if (!mlxsw_sp->rifs)
1930 		return -ENOMEM;
1931 
1932 	mlxsw_reg_rgcr_pack(rgcr_pl, true);
1933 	mlxsw_reg_rgcr_max_router_interfaces_set(rgcr_pl, max_rifs);
1934 	err = mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rgcr), rgcr_pl);
1935 	if (err)
1936 		goto err_rgcr_fail;
1937 
1938 	return 0;
1939 
1940 err_rgcr_fail:
1941 	kfree(mlxsw_sp->rifs);
1942 	return err;
1943 }
1944 
1945 static void __mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
1946 {
1947 	char rgcr_pl[MLXSW_REG_RGCR_LEN];
1948 	int i;
1949 
1950 	mlxsw_reg_rgcr_pack(rgcr_pl, false);
1951 	mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(rgcr), rgcr_pl);
1952 
1953 	for (i = 0; i < MLXSW_CORE_RES_GET(mlxsw_sp->core, MAX_RIFS); i++)
1954 		WARN_ON_ONCE(mlxsw_sp->rifs[i]);
1955 
1956 	kfree(mlxsw_sp->rifs);
1957 }
1958 
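/* FIB notifications arrive in atomic context, so the event is copied
 * into this work item and processed in process context, where RTNL may
 * be taken.
 */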
1959 struct mlxsw_sp_fib_event_work {
1960 	struct delayed_work dw;
1961 	struct fib_entry_notifier_info fen_info;
1962 	struct mlxsw_sp *mlxsw_sp;
1963 	unsigned long event;
1964 };
1965 
1966 static void mlxsw_sp_router_fib_event_work(struct work_struct *work)
1967 {
1968 	struct mlxsw_sp_fib_event_work *fib_work =
1969 		container_of(work, struct mlxsw_sp_fib_event_work, dw.work);
1970 	struct mlxsw_sp *mlxsw_sp = fib_work->mlxsw_sp;
1971 	int err;
1972 
1973 	/* Protect internal structures from changes */
1974 	rtnl_lock();
1975 	switch (fib_work->event) {
1976 	case FIB_EVENT_ENTRY_ADD:
1977 		err = mlxsw_sp_router_fib4_add(mlxsw_sp, &fib_work->fen_info);
1978 		if (err)
1979 			mlxsw_sp_router_fib4_abort(mlxsw_sp);
1980 		fib_info_put(fib_work->fen_info.fi);
1981 		break;
1982 	case FIB_EVENT_ENTRY_DEL:
1983 		mlxsw_sp_router_fib4_del(mlxsw_sp, &fib_work->fen_info);
1984 		fib_info_put(fib_work->fen_info.fi);
1985 		break;
1986 	case FIB_EVENT_RULE_ADD: /* fall through */
1987 	case FIB_EVENT_RULE_DEL:
1988 		mlxsw_sp_router_fib4_abort(mlxsw_sp);
1989 		break;
1990 	}
1991 	rtnl_unlock();
1992 	kfree(fib_work);
1993 }
1994 
1995 /* Called with rcu_read_lock() */
1996 static int mlxsw_sp_router_fib_event(struct notifier_block *nb,
1997 				     unsigned long event, void *ptr)
1998 {
1999 	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
2000 	struct mlxsw_sp_fib_event_work *fib_work;
2001 	struct fib_notifier_info *info = ptr;
2002 
2003 	if (!net_eq(info->net, &init_net))
2004 		return NOTIFY_DONE;
2005 
2006 	fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);
2007 	if (WARN_ON(!fib_work))
2008 		return NOTIFY_BAD;
2009 
2010 	INIT_DELAYED_WORK(&fib_work->dw, mlxsw_sp_router_fib_event_work);
2011 	fib_work->mlxsw_sp = mlxsw_sp;
2012 	fib_work->event = event;
2013 
2014 	switch (event) {
2015 	case FIB_EVENT_ENTRY_ADD: /* fall through */
2016 	case FIB_EVENT_ENTRY_DEL:
2017 		memcpy(&fib_work->fen_info, ptr, sizeof(fib_work->fen_info));
2018 		/* Take a reference on fib_info to prevent it from being
2019 		 * freed while the work is queued. Release it afterwards.
2020 		 */
2021 		fib_info_hold(fib_work->fen_info.fi);
2022 		break;
2023 	}
2024 
2025 	mlxsw_core_schedule_odw(&fib_work->dw, 0);
2026 
2027 	return NOTIFY_DONE;
2028 }
2029 
2030 static void mlxsw_sp_router_fib_dump_flush(struct notifier_block *nb)
2031 {
2032 	struct mlxsw_sp *mlxsw_sp = container_of(nb, struct mlxsw_sp, fib_nb);
2033 
2034 	/* Flush pending FIB notifications and then flush the device's
2035 	 * table before requesting another dump. The FIB notification
2036 	 * block is unregistered, so no need to take RTNL.
2037 	 */
2038 	mlxsw_core_flush_owq();
2039 	mlxsw_sp_router_fib_flush(mlxsw_sp);
2040 }
2041 
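/* Bring up the router: core registers, LPM trees, virtual routers and
 * the neighbour offload machinery, and finally the FIB notifier that
 * feeds route events to this module.
 */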
2042 int mlxsw_sp_router_init(struct mlxsw_sp *mlxsw_sp)
2043 {
2044 	int err;
2045 
2046 	INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_neighs_list);
2047 	INIT_LIST_HEAD(&mlxsw_sp->router.nexthop_group_list);
2048 	err = __mlxsw_sp_router_init(mlxsw_sp);
2049 	if (err)
2050 		return err;
2051 
2052 	mlxsw_sp_lpm_init(mlxsw_sp);
2053 	err = mlxsw_sp_vrs_init(mlxsw_sp);
2054 	if (err)
2055 		goto err_vrs_init;
2056 
2057 	err = mlxsw_sp_neigh_init(mlxsw_sp);
2058 	if (err)
2059 		goto err_neigh_init;
2060 
2061 	mlxsw_sp->fib_nb.notifier_call = mlxsw_sp_router_fib_event;
2062 	err = register_fib_notifier(&mlxsw_sp->fib_nb,
2063 				    mlxsw_sp_router_fib_dump_flush);
2064 	if (err)
2065 		goto err_register_fib_notifier;
2066 
2067 	return 0;
2068 
2069 err_register_fib_notifier:
2070 	mlxsw_sp_neigh_fini(mlxsw_sp);
2071 err_neigh_init:
2072 	mlxsw_sp_vrs_fini(mlxsw_sp);
2073 err_vrs_init:
2074 	__mlxsw_sp_router_fini(mlxsw_sp);
2075 	return err;
2076 }
2077 
2078 void mlxsw_sp_router_fini(struct mlxsw_sp *mlxsw_sp)
2079 {
2080 	unregister_fib_notifier(&mlxsw_sp->fib_nb);
2081 	mlxsw_sp_neigh_fini(mlxsw_sp);
2082 	mlxsw_sp_vrs_fini(mlxsw_sp);
2083 	__mlxsw_sp_router_fini(mlxsw_sp);
2084 }
2085