xref: /titanic_50/usr/src/uts/common/inet/ip/ip2mac.c (revision a5669307eaef64af8519feb70d42f0aa0e7ec21a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Functions to implement IP address -> link layer address (PSARC 2006/482)
28  */
29 #include <inet/ip2mac.h>
30 #include <inet/ip2mac_impl.h>
31 #include <sys/zone.h>
32 #include <sys/dlpi.h>
33 #include <inet/ip_ndp.h>
34 #include <inet/ip_if.h>
35 #include <inet/ip6.h>
36 
37 /*
38  * dispatch pending callbacks.
39  */
40 void
41 nce_cb_dispatch(nce_t *nce)
42 {
43 	nce_cb_t *nce_cb = list_head(&nce->nce_cb);
44 	ip2mac_t ip2m;
45 
46 	mutex_enter(&nce->nce_lock);
47 	if (list_is_empty(&nce->nce_cb)) {
48 		mutex_exit(&nce->nce_lock);
49 		return;
50 	}
51 	nce_ip2mac_response(&ip2m, nce);
52 	nce_cb_refhold_locked(nce);
53 	/*
54 	 * IP does not hold internal locks like nce_lock across calls to
55 	 * other subsystems for fear of recursive lock entry and lock
56 	 * hierarchy violation. The caller may be holding locks across
57 	 * the call to IP. (It would be ideal if no subsystem holds locks
58 	 * across calls into another subsystem, especially if calls can
59 	 * happen in either direction).
60 	 */
61 	nce_cb = list_head(&nce->nce_cb);
62 	for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) {
63 		if (nce_cb->nce_cb_flags & NCE_CB_DISPATCHED)
64 			continue;
65 		nce_cb->nce_cb_flags |= NCE_CB_DISPATCHED;
66 		mutex_exit(&nce->nce_lock);
67 		(*nce_cb->nce_cb_func)(&ip2m, nce_cb->nce_cb_arg);
68 		mutex_enter(&nce->nce_lock);
69 	}
70 	nce_cb_refrele(nce);
71 	mutex_exit(&nce->nce_lock);
72 }
73 
74 /*
75  * fill up the ip2m response fields with inforamation from the nce.
76  */
77 void
78 nce_ip2mac_response(ip2mac_t *ip2m, nce_t *nce)
79 {
80 	boolean_t isv6 = (nce->nce_ipversion == IPV6_VERSION);
81 	sin6_t	*sin6;
82 	struct sockaddr_dl *sdl;
83 	uchar_t *nce_lladdr;
84 
85 	ASSERT(MUTEX_HELD(&nce->nce_lock));
86 	bzero(ip2m, sizeof (*ip2m));
87 	if (NCE_ISREACHABLE(nce) && (nce->nce_flags & NCE_F_CONDEMNED) == 0)
88 		ip2m->ip2mac_err = 0;
89 	else
90 		ip2m->ip2mac_err = ESRCH;
91 	if (isv6) {
92 		sin6 = (sin6_t *)&ip2m->ip2mac_pa;
93 		sin6->sin6_family = AF_INET6;
94 		sin6->sin6_addr = nce->nce_addr;
95 	}
96 	if (ip2m->ip2mac_err == 0) {
97 		sdl = &ip2m->ip2mac_ha;
98 		sdl->sdl_family = AF_LINK;
99 		sdl->sdl_type = nce->nce_ill->ill_type;
100 		sdl->sdl_nlen = 0;
101 		sdl->sdl_alen = nce->nce_ill->ill_phys_addr_length;
102 		nce_lladdr = nce->nce_res_mp->b_rptr +
103 		    NCE_LL_ADDR_OFFSET(nce->nce_ill);
104 		bcopy(nce_lladdr, LLADDR(sdl), sdl->sdl_alen);
105 	}
106 }
107 
108 void
109 nce_cb_refhold_locked(nce_t *nce)
110 {
111 	ASSERT(MUTEX_HELD(&nce->nce_lock));
112 	nce->nce_cb_walker_cnt++;
113 }
114 
115 void
116 nce_cb_refrele(nce_t *nce)
117 {
118 	nce_cb_t *nce_cb, *nce_cb_next = NULL;
119 
120 	ASSERT(MUTEX_HELD(&nce->nce_lock));
121 	if (--nce->nce_cb_walker_cnt == 0) {
122 		for (nce_cb = list_head(&nce->nce_cb); nce_cb != NULL;
123 		    nce_cb = nce_cb_next) {
124 
125 			nce_cb_next = list_next(&nce->nce_cb, nce_cb);
126 			if ((nce_cb->nce_cb_flags & NCE_CB_DISPATCHED) == 0)
127 				continue;
128 			list_remove(&nce->nce_cb, nce_cb);
129 			kmem_free(nce_cb, sizeof (*nce_cb));
130 		}
131 	}
132 }
133 
134 /*
135  * add a callback to the nce, so that the callback can be invoked
136  * after address resolution succeeds/fails.
137  */
138 static ip2mac_id_t
139 nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg)
140 {
141 	nce_cb_t	*nce_cb;
142 	ip2mac_id_t	ip2mid = NULL;
143 
144 	ASSERT(MUTEX_HELD(&nce->nce_lock));
145 	if ((nce_cb = kmem_zalloc(sizeof (*nce_cb), KM_NOSLEEP)) == NULL)
146 		return (ip2mid);
147 	nce_cb->nce_cb_func = cb;
148 	nce_cb->nce_cb_arg = cbarg;
149 	/*
150 	 * We identify the nce_cb_t during cancellation by the address
151 	 * of the nce_cb_t itself, and, as a short-cut for eliminating
152 	 * clear mismatches, only look in the callback list of nce's
153 	 * whose address is equal to the nce_cb_id.
154 	 */
155 	nce_cb->nce_cb_id = nce; /* no refs! just an address */
156 	list_insert_tail(&nce->nce_cb, nce_cb);
157 	ip2mid = nce;  /* this is the id to be used in ip2mac_cancel */
158 
159 	return (nce_cb);
160 }
161 
162 /*
163  * Resolve an IP address to a link-layer address using the data-structures
164  * defined in PSARC 2006/482. If the current link-layer address for the
165  * IP address is not known, the state-machine for resolving the resolution
166  * will be triggered, and the callback function (*cb) will be invoked after
167  * the resolution completes.
168  */
169 ip2mac_id_t
170 ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
171     zoneid_t zoneid)
172 {
173 	nce_t		*nce;
174 	boolean_t	isv6;
175 	ill_t		*ill;
176 	netstack_t	*ns;
177 	ip_stack_t	*ipst;
178 	ip2mac_id_t	ip2mid = NULL;
179 	sin6_t		*sin6;
180 	int		err;
181 	uint64_t	delta;
182 
183 	isv6 = (ip2m->ip2mac_pa.ss_family == AF_INET6);
184 
185 	if (!isv6) {
186 		/*
187 		 * IPv4 is not currently supported.
188 		 */
189 		ip2m->ip2mac_err = ENOTSUP;
190 		return (NULL);
191 	}
192 
193 	ns = netstack_find_by_zoneid(zoneid);
194 	if (ns == NULL) {
195 		ip2m->ip2mac_err = EINVAL;
196 		return (NULL);
197 	}
198 	/*
199 	 * For exclusive stacks we reset the zoneid to zero
200 	 * since IP uses the global zoneid in the exclusive stacks.
201 	 */
202 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
203 		zoneid = GLOBAL_ZONEID;
204 	ipst = ns->netstack_ip;
205 	/*
206 	 * find the ill from the ip2m->ip2mac_ifindex
207 	 */
208 	ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, NULL,
209 	    NULL, NULL, NULL, ipst);
210 	if (ill == NULL) {
211 		ip2m->ip2mac_err = ENXIO;
212 		netstack_rele(ns);
213 		return (NULL);
214 	}
215 	if (isv6) {
216 		sin6 = (sin6_t *)&ip2m->ip2mac_pa;
217 		if (flags == IP2MAC_LOOKUP) {
218 			nce = ndp_lookup_v6(ill, B_FALSE, &sin6->sin6_addr,
219 			    B_FALSE);
220 		} else {
221 			err = ndp_lookup_then_add_v6(ill, B_FALSE, NULL,
222 			    &sin6->sin6_addr, &ipv6_all_ones, &ipv6_all_zeros,
223 			    0, 0, ND_INCOMPLETE, &nce);
224 		}
225 	} else  {
226 		ip2m->ip2mac_err = ENOTSUP; /* yet. */
227 		goto done;
228 	}
229 	if (flags == IP2MAC_LOOKUP) {
230 		if (nce == NULL) {
231 			ip2m->ip2mac_err = ESRCH;
232 			goto done;
233 		}
234 		mutex_enter(&nce->nce_lock);
235 		if (NCE_ISREACHABLE(nce)) {
236 			nce_ip2mac_response(ip2m, nce);
237 			ip2m->ip2mac_err = 0;
238 		} else {
239 			ip2m->ip2mac_err = ESRCH;
240 		}
241 		mutex_exit(&nce->nce_lock);
242 		NCE_REFRELE(nce);
243 		goto done;
244 	} else {
245 		if (err != 0 && err != EEXIST) {
246 			ip2m->ip2mac_err = err;
247 			goto done;
248 		}
249 	}
250 	delta = TICK_TO_MSEC(lbolt64) - nce->nce_last;
251 	mutex_enter(&nce->nce_lock);
252 	if (nce->nce_flags & NCE_F_CONDEMNED) {
253 		ip2m->ip2mac_err = ESRCH;
254 	} else if (!NCE_ISREACHABLE(nce) ||
255 	    delta > (uint64_t)ill->ill_reachable_time) {
256 		if (NCE_ISREACHABLE(nce)) {
257 			/*
258 			 * Since we do not control the packet output
259 			 * path for ip2mac() callers, we need to verify
260 			 * if the existing information in the nce is
261 			 * very old, and retrigger resolution if necessary.
262 			 * We will not return the existing stale
263 			 * information until it is verified through a
264 			 * resolver request/response exchange.
265 			 *
266 			 * In the future, we may want to support extensions
267 			 * that do additional callbacks on link-layer updates,
268 			 * so that we can return the stale information but
269 			 * also update the caller if the lladdr changes.
270 			 */
271 			nce->nce_rcnt = ill->ill_xmit_count;
272 			nce->nce_state = ND_PROBE;
273 			err = 0; /* treat this nce as a new one */
274 		}
275 		if (nce->nce_rcnt > 0) {
276 			/*
277 			 * Still resolving this nce, so we can
278 			 * queue the callback information in nce->nce_cb
279 			 */
280 			ip2mid = nce_add_cb(nce, cb, cbarg);
281 			ip2m->ip2mac_err = EINPROGRESS;
282 		} else {
283 			/*
284 			 * Resolution failed.
285 			 */
286 			ip2m->ip2mac_err = ESRCH;
287 		}
288 	} else {
289 		nce_ip2mac_response(ip2m, nce);
290 		ip2m->ip2mac_err = 0;
291 	}
292 	if (ip2m->ip2mac_err == EINPROGRESS && err != EEXIST)
293 		ip_ndp_resolve(nce);
294 	mutex_exit(&nce->nce_lock);
295 	NCE_REFRELE(nce);
296 done:
297 	netstack_rele(ns);
298 	ill_refrele(ill);
299 	return (ip2mid);
300 }
301 
302 /*
303  * data passed to nce_walk for canceling outstanding callbacks.
304  */
305 typedef struct ip2mac_cancel_data_s {
306 	ip2mac_id_t ip2m_cancel_id;
307 	int	ip2m_cancel_err;
308 } ip2mac_cancel_data_t;
309 
310 /*
311  * callback invoked for each active nce. If the ip2mac_id_t corresponds
312  * to an active nce_cb_t in the nce's callback list, we want to remove
313  * the callback (if there are no walkers) or return EBUSY to the caller
314  */
315 static int
316 ip2mac_cancel_callback(nce_t *nce, void *arg)
317 {
318 	ip2mac_cancel_data_t *ip2m_wdata = arg;
319 	nce_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id;
320 	nce_cb_t *nce_cb;
321 
322 	if (ip2m_nce_cb->nce_cb_id != nce)
323 		return (0);
324 
325 	mutex_enter(&nce->nce_lock);
326 	if (list_is_empty(&nce->nce_cb)) {
327 		mutex_exit(&nce->nce_lock);
328 		return (0);
329 	}
330 	/*
331 	 * IP does not hold internal locks like nce_lock across calls to
332 	 * other subsystems for fear of recursive lock entry and lock
333 	 * hierarchy violation. The caller may be holding locks across
334 	 * the call to IP. (It would be ideal if no subsystem holds locks
335 	 * across calls into another subsystem, especially if calls can
336 	 * happen in either direction).
337 	 */
338 	nce_cb = list_head(&nce->nce_cb);
339 	for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) {
340 		if (nce_cb != ip2m_nce_cb)
341 			continue;
342 		/*
343 		 * If there are no walkers we can remove the nce_cb.
344 		 * Otherwise the exiting walker will clean up.
345 		 */
346 		if (nce->nce_cb_walker_cnt == 0) {
347 			list_remove(&nce->nce_cb, nce_cb);
348 		} else {
349 			ip2m_wdata->ip2m_cancel_err = EBUSY;
350 		}
351 		break;
352 	}
353 	mutex_exit(&nce->nce_lock);
354 	return (0);
355 }
356 
357 /*
358  * cancel an outstanding timeout set up via ip2mac
359  */
360 int
361 ip2mac_cancel(ip2mac_id_t ip2mid, zoneid_t zoneid)
362 {
363 	netstack_t	*ns;
364 	ip_stack_t	*ipst;
365 	ip2mac_cancel_data_t ip2m_wdata;
366 
367 	ns = netstack_find_by_zoneid(zoneid);
368 	if (ns == NULL) {
369 		ip2m_wdata.ip2m_cancel_err = EINVAL;
370 		return (ip2m_wdata.ip2m_cancel_err);
371 	}
372 	/*
373 	 * For exclusive stacks we reset the zoneid to zero
374 	 * since IP uses the global zoneid in the exclusive stacks.
375 	 */
376 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
377 		zoneid = GLOBAL_ZONEID;
378 	ipst = ns->netstack_ip;
379 
380 	ip2m_wdata.ip2m_cancel_id = ip2mid;
381 	ip2m_wdata.ip2m_cancel_err = 0;
382 	ndp_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst);
383 	/*
384 	 * We may return EBUSY if a walk to dispatch callbacks is
385 	 * in progress, in which case the caller needs to synchronize
386 	 * with the registered callback function to make sure the
387 	 * module does not exit when there is a callback pending.
388 	 */
389 	netstack_rele(ns);
390 	return (ip2m_wdata.ip2m_cancel_err);
391 }
392