xref: /titanic_51/usr/src/uts/common/inet/ip_ire.h (revision e642872b5a76c4c8654eaa68f64986a85c86ca37)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #ifndef	_INET_IP_IRE_H
28 #define	_INET_IP_IRE_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #ifdef	__cplusplus
33 extern "C" {
34 #endif
35 
36 #define	IPV6_LL_PREFIXLEN	10	/* # of bits in link-local prefix */
37 
38 #define	IP_CACHE_TABLE_SIZE	256	/* IPv4 cache table size */
39 #define	IP_MRTUN_TABLE_SIZE	256	/* Mobile IP reverse tunnel table */
40 					/* size. Only used by mipagent */
41 #define	IP_SRCIF_TABLE_SIZE	256	/* Per interface routing table size */
42 #define	IP_MASK_TABLE_SIZE	(IP_ABITS + 1)		/* 33 ptrs */
43 
44 #define	IP6_FTABLE_HASH_SIZE	32	/* size of each hash table in ptrs */
45 #define	IP6_CACHE_TABLE_SIZE	256	/* IPv6 cache table size */
46 #define	IP6_MASK_TABLE_SIZE	(IPV6_ABITS + 1)	/* 129 ptrs */
47 
/*
 * IPv4 bucket selection.  The address is XOR-ed with copies of itself
 * shifted right by 8, 16 and 24 bits, which folds every byte of the
 * address into the low-order byte, and the result is then reduced to a
 * bucket index.  ip_ire_init() makes sure the cache table size is always
 * a power of 2, which is why the reduction can be a mask with
 * (table_size - 1) instead of a % operation.  The folding matters because
 * the hash table is probably a lot smaller than 2^32 buckets, so only the
 * low-order bits of the mixed value end up choosing the bucket.
 *
 * Note that addr is expanded four times; do not pass an expression with
 * side effects.
 */
#define	IRE_ADDR_HASH(addr, table_size) \
	((((addr) >> 24) ^ ((addr) >> 16) ^ ((addr) >> 8) ^ (addr)) &	\
	((table_size) - 1))
59 
/*
 * IPv6 bucket selection.  Mixes the last two 32-bit words of the address:
 * (word 2 ^ word 3) is shifted right by 12 bits and XOR-ed into word 3,
 * and the result is masked with (table_size - 1), so table_size is
 * expected to be a power of 2.  These are the bytes that are likely to
 * contain the MAC address; EUI-64 format is assumed for good hashing.
 *
 * Note that addr is expanded three times.
 */
#define	IRE_ADDR_HASH_V6(addr, table_size)				\
	(((((addr).s6_addr32[2] ^ (addr).s6_addr32[3]) >> 12) ^		\
	(addr).s6_addr32[3]) & ((table_size) - 1))
/*
 * Forwarding-table bucket selection for an IPv6 address/mask pair.  Bytes
 * 8, 9, 10, 13, 14 and 15 of the address are each ANDed with the same byte
 * of the mask and the six results are XOR-ed together; bytes 11 and 12 do
 * not take part (presumably because they are the fixed filler of an EUI-64
 * interface identifier -- confirm).  The final reduction is a mask with
 * (table_size - 1), so this assumes that the ftable size is a power of 2.
 */
#define	IRE_ADDR_MASK_HASH_V6(addr, mask, table_size)			\
	((((addr).s6_addr8[15] & (mask).s6_addr8[15]) ^			\
	((addr).s6_addr8[14] & (mask).s6_addr8[14]) ^			\
	((addr).s6_addr8[13] & (mask).s6_addr8[13]) ^			\
	((addr).s6_addr8[10] & (mask).s6_addr8[10]) ^			\
	((addr).s6_addr8[9] & (mask).s6_addr8[9]) ^			\
	((addr).s6_addr8[8] & (mask).s6_addr8[8])) & ((table_size) - 1))
76 
77 /*
78  * Match flags for the IRE lookup routines; OR them together as needed.
79  * MATCH_IRE_DSTONLY is 0, so it cannot be tested for with a bitwise AND.
80  */
81 #define	MATCH_IRE_DSTONLY	0x0000	/* Match just the address */
82 #define	MATCH_IRE_TYPE		0x0001	/* Match IRE type */
83 #define	MATCH_IRE_SRC		0x0002	/* Match IRE source address */
84 #define	MATCH_IRE_MASK		0x0004	/* Match IRE mask */
85 #define	MATCH_IRE_WQ		0x0008	/* Match IRE Write Q */
86 #define	MATCH_IRE_GW		0x0010	/* Match IRE gateway */
87 #define	MATCH_IRE_IPIF		0x0020	/* Match IRE ipif */
88 #define	MATCH_IRE_RECURSIVE	0x0040	/* Do recursive lookup if necessary */
89 #define	MATCH_IRE_DEFAULT	0x0080	/* Return default route if no route */
90 					/* found. */
91 #define	MATCH_IRE_RJ_BHOLE	0x0100	/* During lookup if we hit an ire */
92 					/* with RTF_REJECT or RTF_BLACKHOLE, */
93 					/* return the ire. No recursive */
94 					/* lookup should be done. */
95 #define	MATCH_IRE_IHANDLE	0x0200	/* Match IRE on ihandle */
96 #define	MATCH_IRE_MARK_HIDDEN	0x0400	/* Match IRE ire_marks with */
97 					/* IRE_MARK_HIDDEN. */
98 /*
99  * MATCH_IRE_ILL is used whenever we want to specifically match an IRE
100  * whose ire_ipif->ipif_ill or (ill_t *)ire_stq->q_ptr matches a given
101  * ill. When MATCH_IRE_ILL is used to locate an IRE_CACHE, it implies
102  * that the packet will not be load balanced. This is normally used
103  * by in.mpathd to send out failure detection probes.
104  *
105  * MATCH_IRE_ILL_GROUP is used whenever we are not specific about which
106  * interface (ill) the packet should be sent out. This implies that the
107  * packets will be subjected to load balancing and it might go out on
108  * any interface in the group. When there is only one interface in the group,
109  * MATCH_IRE_ILL_GROUP becomes MATCH_IRE_ILL. Most of the code uses
110  * MATCH_IRE_ILL_GROUP and MATCH_IRE_ILL is used in very few cases where
111  * we want to disable load balancing.
112  *
113  * MATCH_IRE_PARENT is used whenever we unconditionally want to get the
114  * parent IRE (sire) while recursively searching IREs for an offsubnet
115  * destination. With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE
116  * is found to help resolving IRE_OFFSUBNET in lookup routines, the
117  * IRE_OFFSUBNET sire, if any, is returned to the caller.
118  */
119 #define	MATCH_IRE_ILL_GROUP	0x0800	/* Match IRE on ill or the ill_group. */
120 #define	MATCH_IRE_ILL		0x1000	/* Match IRE on the ill only */
121 
122 #define	MATCH_IRE_PARENT	0x2000	/* Match parent ire, if any, */
123 					/* even if ire is not matched. */
124 #define	MATCH_IRE_ZONEONLY	0x4000	/* Match IREs in specified zone, ie */
125 					/* don't match IRE_LOCALs from other */
126 					/* zones or shared IREs */
127 #define	MATCH_IRE_MARK_PRIVATE_ADDR	0x8000	/* Match IRE ire_marks with */
128 						/* IRE_MARK_PRIVATE_ADDR. */
129 #define	MATCH_IRE_SECATTR	0x10000	/* Match gateway security attributes */
130 #define	MATCH_IRE_COMPLETE	0x20000	/* ire_ftable_lookup() can return */
131 					/* IRE_CACHE entry only if it is  */
132 					/* ND_REACHABLE			  */
133 
/*
 * Any ire to nce association is long term, and the refhold and refrele
 * may be done by different threads.  So all cases of making or breaking
 * an ire to nce association should effectively use the NOTR variants.
 * To understand the *effectively* part read on.
 *
 * ndp_lookup() and ndp_add() implicitly do an NCE_REFHOLD.  So wherever
 * we make an ire to nce association after calling these functions, we
 * effectively want to end up with an NCE_REFHOLD_NOTR.  This macro turns
 * an NCE_REFHOLD into an NCE_REFHOLD_NOTR: it takes the NOTR hold first,
 * and its NCE_REFRELE then cancels off the implicit NCE_REFHOLD done by
 * ndp_lookup() or ndp_add(), so what you are left with is an
 * NCE_REFHOLD_NOTR.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement; "NCE_REFHOLD_TO_REFHOLD_NOTR(nce);" can then be
 * used as the unbraced body of an if/else.  Always invoke it with a
 * trailing semicolon.  CONSTCOND is the lint annotation for the
 * intentionally constant loop condition.  Note that nce is expanded twice.
 */
#define	NCE_REFHOLD_TO_REFHOLD_NOTR(nce) do {	\
	NCE_REFHOLD_NOTR(nce);			\
	NCE_REFRELE(nce);			\
	/*CONSTCOND*/				\
} while (0)
153 
/*
 * Advance ire along the ire_next chain, starting with ire itself, until
 * it refers to an entry that is not marked IRE_MARK_CONDEMNED.  ire is
 * left unchanged if it is already not condemned, and is set to NULL if
 * we reach the end of the list.  Caller must hold the ire_bucket lock.
 *
 * ire must be a modifiable lvalue and is expanded several times.  The
 * body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and "IRE_FIND_NEXT_ORIGIN(ire);" can be used as
 * the unbraced body of an if/else.  Always invoke it with a trailing
 * semicolon.  CONSTCOND is the lint annotation for the intentionally
 * constant loop condition.
 */
#define	IRE_FIND_NEXT_ORIGIN(ire) do {					\
	while ((ire) != NULL && ((ire)->ire_marks & IRE_MARK_CONDEMNED))\
		(ire) = (ire)->ire_next;				\
	/*CONSTCOND*/							\
} while (0)
164 
165 
166 /* Structure for ire_cache_count(): counts of IRE_CACHE entries by class */
167 typedef struct {
168 	int	icc_total;	/* Total number of IRE_CACHE entries */
169 	int	icc_unused;	/* # off/no PMTU unused since last reclaim */
170 	int	icc_offlink;	/* # offlink without PMTU information */
171 	int	icc_pmtu;	/* # offlink with PMTU information */
172 	int	icc_onlink;	/* # onlink */
173 } ire_cache_count_t;
174 
175 /*
176  * Structure for ire_cache_reclaim(). Each field is a fraction, i.e. 1 means
177  * reclaim all, N means reclaim 1/Nth of all entries, 0 means reclaim none.
178  */
179 typedef struct {
180 	int	icr_unused;	/* Fraction for unused since last reclaim */
181 	int	icr_offlink;	/* Fraction for offlink without PMTU info */
182 	int	icr_pmtu;	/* Fraction for offlink with PMTU info */
183 	int	icr_onlink;	/* Fraction for onlink */
184 } ire_cache_reclaim_t;
185 
186 typedef struct {	/* ire accounting counters; see BUMP_IRE_STATS() */
187 	uint64_t ire_stats_alloced;	/* # of ires alloced */
188 	uint64_t ire_stats_freed;	/* # of ires freed */
189 	uint64_t ire_stats_inserted;	/* # of ires inserted in the bucket */
190 	uint64_t ire_stats_deleted;	/* # of ires deleted from the bucket */
191 } ire_stats_t;
192 
193 extern ire_stats_t ire_stats_v4;
194 extern uint32_t ip_cache_table_size;
195 extern uint32_t ip6_cache_table_size;
196 extern irb_t *ip_cache_table;
197 extern uint32_t ip6_ftable_hash_size;
198 
199 /*
200  * Atomically add 1 to the 64-bit counter x of ire_stats (an ire_stats_t).
201  * Atomics keep the ire accounting exact; otherwise leaks can't be found.
202  */
203 #define	BUMP_IRE_STATS(ire_stats, x) atomic_add_64(&(ire_stats).x, 1)
204 
205 extern irb_t *ip_forwarding_table_v6[];
206 extern irb_t *ip_cache_table_v6;
207 extern irb_t *ip_mrtun_table;
208 extern irb_t *ip_srcif_table;
209 extern kmutex_t ire_ft_init_lock;
210 extern kmutex_t	ire_mrtun_lock;
211 extern kmutex_t ire_srcif_table_lock;
212 extern ire_stats_t ire_stats_v6;
213 extern uint_t	ire_mrtun_count;
214 extern uint_t ire_srcif_table_count;
215 
216 #ifdef _KERNEL
217 struct ts_label_s;
218 
219 extern	ipaddr_t	ip_plen_to_mask(uint_t);
220 extern	in6_addr_t	*ip_plen_to_mask_v6(uint_t, in6_addr_t *);
221 
222 extern	int	ip_ire_advise(queue_t *, mblk_t *, cred_t *);
223 extern	int	ip_ire_delete(queue_t *, mblk_t *, cred_t *);
224 extern	boolean_t ip_ire_clookup_and_delete(ipaddr_t, ipif_t *);
225 extern	void	ip_ire_clookup_and_delete_v6(const in6_addr_t *);
226 
227 extern	int	ip_ire_report(queue_t *, mblk_t *, caddr_t, cred_t *);
228 extern	int	ip_ire_report_mrtun(queue_t *, mblk_t *, caddr_t, cred_t *);
229 extern	int	ip_ire_report_srcif(queue_t *, mblk_t *, caddr_t, cred_t *);
230 extern	int	ip_ire_report_v6(queue_t *, mblk_t *, caddr_t, cred_t *);
231 extern	void	ire_report_ftable(ire_t *, char *);
232 
233 extern	void	ip_ire_req(queue_t *, mblk_t *);
234 
235 extern	int	ip_mask_to_plen(ipaddr_t);
236 extern	int	ip_mask_to_plen_v6(const in6_addr_t *);
237 
238 extern	ire_t	*ipif_to_ire(const ipif_t *);
239 extern	ire_t	*ipif_to_ire_v6(const ipif_t *);
240 
241 extern	int	ire_add(ire_t **, queue_t *, mblk_t *, ipsq_func_t, boolean_t);
242 extern	int	ire_add_mrtun(ire_t **, queue_t *, mblk_t *, ipsq_func_t);
243 extern	void	ire_add_then_send(queue_t *, ire_t *, mblk_t *);
244 extern	int	ire_add_v6(ire_t **, queue_t *, mblk_t *, ipsq_func_t);
245 extern	int	ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q,
246     mblk_t *mp, ipsq_func_t func);
247 extern	void	ire_atomic_end(irb_t *irb_ptr, ire_t *ire);
248 
249 extern	void	ire_cache_count(ire_t *, char *);
250 extern	ire_t	*ire_cache_lookup(ipaddr_t, zoneid_t,
251     const struct ts_label_s *);
252 extern	ire_t	*ire_cache_lookup_v6(const in6_addr_t *, zoneid_t,
253     const struct ts_label_s *);
254 extern	void	ire_cache_reclaim(ire_t *, char *);
255 
256 extern	void	ire_check_bcast_present(ipif_t *, ipaddr_t, int, boolean_t *,
257     boolean_t *);
258 
259 extern	ire_t	*ire_create_mp(uchar_t *, uchar_t *, uchar_t *, uchar_t *,
260     uchar_t *, uint_t, mblk_t *, queue_t *, queue_t *, ushort_t, mblk_t *,
261     ipif_t *, ill_t *, ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *,
262     tsol_gc_t *, tsol_gcgrp_t *);
263 
264 extern	ire_t	*ire_create(uchar_t *, uchar_t *, uchar_t *, uchar_t *,
265     uchar_t *, uint_t *, mblk_t *, queue_t *, queue_t *, ushort_t, mblk_t *,
266     ipif_t *, ill_t *, ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *,
267     tsol_gc_t *, tsol_gcgrp_t *);
268 
269 extern	ire_t	**ire_check_and_create_bcast(ipif_t *, ipaddr_t,
270     ire_t **, int);
271 extern	ire_t	**ire_create_bcast(ipif_t *, ipaddr_t, ire_t **);
272 extern	ire_t	*ire_init(ire_t *, uchar_t *, uchar_t *, uchar_t *,
273     uchar_t *, uchar_t *, uint_t *, mblk_t *, queue_t *, queue_t *, ushort_t,
274     mblk_t *, ipif_t *, ill_t *, ipaddr_t, uint32_t, uint32_t, uint32_t,
275     const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *);
276 
277 extern	boolean_t ire_init_common(ire_t *, uint_t *, mblk_t *, queue_t *,
278     queue_t *, ushort_t, mblk_t *, ipif_t *, ill_t *, uint32_t,
279     uint32_t, uint32_t, uchar_t, const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *);
280 
281 extern	ire_t	*ire_create_v6(const in6_addr_t *, const in6_addr_t *,
282     const in6_addr_t *, const in6_addr_t *, uint_t *, mblk_t *, queue_t *,
283     queue_t *, ushort_t, mblk_t *, ipif_t *,
284     const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *,
285     tsol_gc_t *, tsol_gcgrp_t *);
286 
287 extern	ire_t	*ire_create_mp_v6(const in6_addr_t *, const in6_addr_t *,
288     const in6_addr_t *, const in6_addr_t *, mblk_t *, queue_t *,
289     queue_t *, ushort_t, mblk_t *, ipif_t *,
290     const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *,
291     tsol_gc_t *, tsol_gcgrp_t *);
292 
293 extern	ire_t	*ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
294     const in6_addr_t *, const in6_addr_t *, uint_t *, mblk_t *, queue_t *,
295     queue_t *, ushort_t, mblk_t *, ipif_t *,
296     const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *,
297     tsol_gc_t *, tsol_gcgrp_t *);
298 
299 extern	void	ire_clookup_delete_cache_gw(ipaddr_t, zoneid_t);
300 extern	void	ire_clookup_delete_cache_gw_v6(const in6_addr_t *, zoneid_t);
301 
302 extern	ire_t	*ire_ctable_lookup(ipaddr_t, ipaddr_t, int, const ipif_t *,
303     zoneid_t, const struct ts_label_s *, int);
304 
305 extern	ire_t	*ire_ctable_lookup_v6(const in6_addr_t *, const in6_addr_t *,
306     int, const ipif_t *, zoneid_t, const struct ts_label_s *, int);
307 
308 extern	void	ire_delete(ire_t *);
309 extern	void	ire_delete_cache_gw(ire_t *, char *);
310 extern	void	ire_delete_cache_gw_v6(ire_t *, char *);
311 extern	void	ire_delete_cache_v6(ire_t *, char *);
312 extern	void	ire_delete_srcif(ire_t *);
313 extern	void	ire_delete_v6(ire_t *);
314 
315 extern	void	ire_expire(ire_t *, char *);
316 extern	void	ire_fastpath_flush(ire_t *, void *);
317 extern	boolean_t ire_fastpath_update(ire_t *, void *);
318 
319 extern	void	ire_flush_cache_v4(ire_t *, int);
320 extern	void	ire_flush_cache_v6(ire_t *, int);
321 
322 extern	ire_t	*ire_ftable_lookup_v6(const in6_addr_t *, const in6_addr_t *,
323     const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t,
324     uint32_t, const struct ts_label_s *, int);
325 
326 extern	ire_t	*ire_ihandle_lookup_onlink(ire_t *);
327 extern	ire_t	*ire_ihandle_lookup_offlink(ire_t *, ire_t *);
328 extern	ire_t	*ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *);
329 
330 extern	boolean_t	ire_local_same_ill_group(ire_t *, ire_t *);
331 extern	boolean_t	ire_local_ok_across_zones(ire_t *, zoneid_t, void *,
332 			    const struct ts_label_s *tsl);
333 
334 extern	ire_t 	*ire_lookup_local(zoneid_t);
335 extern	ire_t 	*ire_lookup_local_v6(zoneid_t);
336 
337 extern  ire_t	*ire_lookup_multi(ipaddr_t, zoneid_t);
338 extern  ire_t	*ire_lookup_multi_v6(const in6_addr_t *, zoneid_t);
339 
340 extern ire_t	*ire_mrtun_lookup(ipaddr_t, ill_t *);
341 
342 extern	void	ire_refrele(ire_t *);
343 extern	void	ire_refrele_notr(ire_t *);
344 extern	ire_t	*ire_route_lookup(ipaddr_t, ipaddr_t, ipaddr_t, int,
345     const ipif_t *, ire_t **, zoneid_t, const struct ts_label_s *, int);
346 
347 extern	ire_t	*ire_route_lookup_v6(const in6_addr_t *, const in6_addr_t *,
348     const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t,
349     const struct ts_label_s *, int);
350 
351 extern	ire_t	*ire_srcif_table_lookup(ipaddr_t, int, ipif_t *, ill_t *, int);
352 extern ill_t	*ire_to_ill(const ire_t *);
353 
354 extern	void	ire_walk(pfv_t, void *);
355 extern	void	ire_walk_ill(uint_t, uint_t, pfv_t, void *, ill_t *);
356 extern	void	ire_walk_ill_mrtun(uint_t, uint_t, pfv_t, void *, ill_t *);
357 extern	void	ire_walk_ill_v4(uint_t, uint_t, pfv_t, void *, ill_t *);
358 extern	void	ire_walk_ill_v6(uint_t, uint_t, pfv_t, void *, ill_t *);
359 extern	void	ire_walk_v4(pfv_t, void *, zoneid_t);
360 extern  void	ire_walk_ill_tables(uint_t match_flags, uint_t ire_type,
361     pfv_t func, void *arg, size_t ftbl_sz, size_t htbl_sz,
362     irb_t **ipftbl, size_t ctbl_sz, irb_t *ipctbl, ill_t *ill,
363     zoneid_t zoneid);
364 extern	void	ire_walk_srcif_table_v4(pfv_t, void *);
365 extern	void	ire_walk_v6(pfv_t, void *, zoneid_t);
366 
367 extern boolean_t	ire_multirt_lookup(ire_t **, ire_t **, uint32_t,
368     const struct ts_label_s *);
369 extern boolean_t	ire_multirt_need_resolve(ipaddr_t,
370     const struct ts_label_s *);
371 extern boolean_t	ire_multirt_lookup_v6(ire_t **, ire_t **, uint32_t,
372     const struct ts_label_s *);
373 extern boolean_t	ire_multirt_need_resolve_v6(const in6_addr_t *,
374     const struct ts_label_s *);
375 
376 extern ire_t	*ipif_lookup_multi_ire(ipif_t *, ipaddr_t);
377 extern ire_t	*ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *);
378 
379 extern void	ire_fastpath_list_dispatch(ill_t *,
380     boolean_t (*)(ire_t *, void *), void *);
381 extern void	ire_fastpath_list_delete(ill_t *, ire_t *);
382 
383 extern ire_t	*ire_get_next_bcast_ire(ire_t *, ire_t *);
384 extern ire_t	*ire_get_next_default_ire(ire_t *, ire_t *);
385 
386 extern  void	ire_arpresolve(ire_t *,  ill_t *);
387 extern  void	ire_freemblk(ire_t *);
388 extern  void	ire_fastpath(ire_t *);
389 extern boolean_t	ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t,
390     int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int);
391 extern  int	ire_nce_init(ire_t *, mblk_t *, mblk_t *);
392 extern  boolean_t	ire_walk_ill_match(uint_t, uint_t, ire_t *, ill_t *,
393     zoneid_t);
394 
395 #endif /* _KERNEL */
396 
397 #ifdef	__cplusplus
398 }
399 #endif
400 
401 #endif	/* _INET_IP_IRE_H */
402