xref: /titanic_51/usr/src/uts/common/inet/ip/ip2mac.c (revision 3d5869cc80c26887ee2c78b0f35e58f574e97192)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Functions to implement IP address -> link layer address (PSARC 2006/482)
29  */
30 #include <inet/ip2mac.h>
31 #include <inet/ip2mac_impl.h>
32 #include <sys/zone.h>
33 #include <inet/ip_ndp.h>
34 #include <inet/ip_if.h>
35 #include <inet/ip6.h>
36 
37 /*
38  * dispatch pending callbacks.
39  */
40 void
41 ncec_cb_dispatch(ncec_t *ncec)
42 {
43 	ncec_cb_t *ncec_cb;
44 	ip2mac_t ip2m;
45 
46 	mutex_enter(&ncec->ncec_lock);
47 	if (list_is_empty(&ncec->ncec_cb)) {
48 		mutex_exit(&ncec->ncec_lock);
49 		return;
50 	}
51 	ncec_ip2mac_response(&ip2m, ncec);
52 	ncec_cb_refhold_locked(ncec);
53 	/*
54 	 * IP does not hold internal locks like nce_lock across calls to
55 	 * other subsystems for fear of recursive lock entry and lock
56 	 * hierarchy violation. The caller may be holding locks across
57 	 * the call to IP. (It would be ideal if no subsystem holds locks
58 	 * across calls into another subsystem, especially if calls can
59 	 * happen in either direction).
60 	 */
61 	ncec_cb = list_head(&ncec->ncec_cb);
62 	for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) {
63 		if (ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED)
64 			continue;
65 		ncec_cb->ncec_cb_flags |= NCE_CB_DISPATCHED;
66 		mutex_exit(&ncec->ncec_lock);
67 		(*ncec_cb->ncec_cb_func)(&ip2m, ncec_cb->ncec_cb_arg);
68 		mutex_enter(&ncec->ncec_lock);
69 	}
70 	ncec_cb_refrele(ncec);
71 	mutex_exit(&ncec->ncec_lock);
72 }
73 
74 /*
75  * fill up the ip2m response fields with inforamation from the nce.
76  */
77 void
78 ncec_ip2mac_response(ip2mac_t *ip2m, ncec_t *ncec)
79 {
80 	boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION);
81 	sin_t	*sin;
82 	sin6_t	*sin6;
83 	struct sockaddr_dl *sdl;
84 
85 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
86 	bzero(ip2m, sizeof (*ip2m));
87 	if (NCE_ISREACHABLE(ncec) && !NCE_ISCONDEMNED(ncec))
88 		ip2m->ip2mac_err = 0;
89 	else
90 		ip2m->ip2mac_err = ESRCH;
91 	if (isv6) {
92 		sin6 = (sin6_t *)&ip2m->ip2mac_pa;
93 		sin6->sin6_family = AF_INET6;
94 		sin6->sin6_addr = ncec->ncec_addr;
95 	} else {
96 		sin = (sin_t *)&ip2m->ip2mac_pa;
97 		sin->sin_family = AF_INET;
98 		IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &sin->sin_addr);
99 	}
100 	if (ip2m->ip2mac_err == 0) {
101 		sdl = &ip2m->ip2mac_ha;
102 		sdl->sdl_family = AF_LINK;
103 		sdl->sdl_type = ncec->ncec_ill->ill_type;
104 		/*
105 		 * should we put ncec_ill->ill_name in there? why?
106 		 * likewise for the sdl_index
107 		 */
108 		sdl->sdl_nlen = 0;
109 		sdl->sdl_alen = ncec->ncec_ill->ill_phys_addr_length;
110 		if (ncec->ncec_lladdr != NULL)
111 			bcopy(ncec->ncec_lladdr, LLADDR(sdl), sdl->sdl_alen);
112 	}
113 }
114 
115 void
116 ncec_cb_refhold_locked(ncec_t *ncec)
117 {
118 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
119 	ncec->ncec_cb_walker_cnt++;
120 }
121 
122 void
123 ncec_cb_refrele(ncec_t *ncec)
124 {
125 	ncec_cb_t *ncec_cb, *ncec_cb_next = NULL;
126 
127 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
128 	if (--ncec->ncec_cb_walker_cnt == 0) {
129 		for (ncec_cb = list_head(&ncec->ncec_cb); ncec_cb != NULL;
130 		    ncec_cb = ncec_cb_next) {
131 
132 			ncec_cb_next = list_next(&ncec->ncec_cb, ncec_cb);
133 			if ((ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED) == 0)
134 				continue;
135 			list_remove(&ncec->ncec_cb, ncec_cb);
136 			kmem_free(ncec_cb, sizeof (*ncec_cb));
137 		}
138 	}
139 }
140 
141 /*
142  * add a callback to the nce, so that the callback can be invoked
143  * after address resolution succeeds/fails.
144  */
145 static ip2mac_id_t
146 ncec_add_cb(ncec_t *ncec, ip2mac_callback_t *cb, void *cbarg)
147 {
148 	ncec_cb_t	*nce_cb;
149 	ip2mac_id_t	ip2mid = NULL;
150 
151 	ASSERT(MUTEX_HELD(&ncec->ncec_lock));
152 	if ((nce_cb = kmem_zalloc(sizeof (*nce_cb), KM_NOSLEEP)) == NULL)
153 		return (ip2mid);
154 	nce_cb->ncec_cb_func = cb;
155 	nce_cb->ncec_cb_arg = cbarg;
156 	/*
157 	 * We identify the ncec_cb_t during cancellation by the address
158 	 * of the nce_cb_t itself, and, as a short-cut for eliminating
159 	 * clear mismatches, only look in the callback list of ncec's
160 	 * whose address is equal to the nce_cb_id.
161 	 */
162 	nce_cb->ncec_cb_id = ncec; /* no refs! just an address */
163 	list_insert_tail(&ncec->ncec_cb, nce_cb);
164 	ip2mid = ncec;  /* this is the id to be used in ip2mac_cancel */
165 
166 	return (nce_cb);
167 }
168 
169 /*
170  * Resolve an IP address to a link-layer address using the data-structures
171  * defined in PSARC 2006/482. If the current link-layer address for the
172  * IP address is not known, the state-machine for resolving the resolution
173  * will be triggered, and the callback function (*cb) will be invoked after
174  * the resolution completes.
175  */
176 ip2mac_id_t
177 ip2mac(uint_t op, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg,
178     zoneid_t zoneid)
179 {
180 	ncec_t		*ncec;
181 	nce_t		*nce = NULL;
182 	boolean_t	isv6;
183 	ill_t		*ill;
184 	netstack_t	*ns;
185 	ip_stack_t	*ipst;
186 	ip2mac_id_t	ip2mid = NULL;
187 	sin_t		*sin;
188 	sin6_t		*sin6;
189 	int		err;
190 	uint64_t	delta;
191 	boolean_t	need_resolve = B_FALSE;
192 
193 	isv6 = (ip2m->ip2mac_pa.ss_family == AF_INET6);
194 
195 	ns = netstack_find_by_zoneid(zoneid);
196 	if (ns == NULL) {
197 		ip2m->ip2mac_err = EINVAL;
198 		return (NULL);
199 	}
200 	/*
201 	 * For exclusive stacks we reset the zoneid to zero
202 	 * since IP uses the global zoneid in the exclusive stacks.
203 	 */
204 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
205 		zoneid = GLOBAL_ZONEID;
206 	ipst = ns->netstack_ip;
207 	/*
208 	 * find the ill from the ip2m->ip2mac_ifindex
209 	 */
210 	ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, ipst);
211 	if (ill == NULL) {
212 		ip2m->ip2mac_err = ENXIO;
213 		netstack_rele(ns);
214 		return (NULL);
215 	}
216 	if (isv6) {
217 		sin6 = (sin6_t *)&ip2m->ip2mac_pa;
218 		if (op == IP2MAC_LOOKUP) {
219 			nce = nce_lookup_v6(ill, &sin6->sin6_addr);
220 		} else {
221 			err = nce_lookup_then_add_v6(ill, NULL,
222 			    ill->ill_phys_addr_length,
223 			    &sin6->sin6_addr, 0, ND_UNCHANGED, &nce);
224 		}
225 	} else  {
226 		sin = (sin_t *)&ip2m->ip2mac_pa;
227 		if (op == IP2MAC_LOOKUP) {
228 			nce = nce_lookup_v4(ill, &sin->sin_addr.s_addr);
229 		} else {
230 			err = nce_lookup_then_add_v4(ill, NULL,
231 			    ill->ill_phys_addr_length,
232 			    &sin->sin_addr.s_addr, 0, ND_UNCHANGED, &nce);
233 		}
234 	}
235 	if (op == IP2MAC_LOOKUP) {
236 		if (nce == NULL) {
237 			ip2m->ip2mac_err = ESRCH;
238 			goto done;
239 		}
240 		ncec = nce->nce_common;
241 		delta = TICK_TO_MSEC(ddi_get_lbolt64()) - ncec->ncec_last;
242 		mutex_enter(&ncec->ncec_lock);
243 		if (NCE_ISREACHABLE(ncec) &&
244 		    delta < (uint64_t)ill->ill_reachable_time) {
245 			ncec_ip2mac_response(ip2m, ncec);
246 			ip2m->ip2mac_err = 0;
247 		} else {
248 			ip2m->ip2mac_err = ESRCH;
249 		}
250 		mutex_exit(&ncec->ncec_lock);
251 		goto done;
252 	} else {
253 		if (err != 0 && err != EEXIST) {
254 			ip2m->ip2mac_err = err;
255 			goto done;
256 		}
257 	}
258 	ncec = nce->nce_common;
259 	delta = TICK_TO_MSEC(ddi_get_lbolt64()) - ncec->ncec_last;
260 	mutex_enter(&ncec->ncec_lock);
261 	if (NCE_ISCONDEMNED(ncec)) {
262 		ip2m->ip2mac_err = ESRCH;
263 	} else {
264 		if (NCE_ISREACHABLE(ncec)) {
265 			if (NCE_MYADDR(ncec) ||
266 			    delta < (uint64_t)ill->ill_reachable_time) {
267 				ncec_ip2mac_response(ip2m, ncec);
268 				ip2m->ip2mac_err = 0;
269 				mutex_exit(&ncec->ncec_lock);
270 				goto done;
271 			}
272 			/*
273 			 * Since we do not control the packet output
274 			 * path for ip2mac() callers, we need to verify
275 			 * if the existing information in the nce is
276 			 * very old, and retrigger resolution if necessary.
277 			 * We will not return the existing stale
278 			 * information until it is verified through a
279 			 * resolver request/response exchange.
280 			 *
281 			 * In the future, we may want to support extensions
282 			 * that do additional callbacks on link-layer updates,
283 			 * so that we can return the stale information but
284 			 * also update the caller if the lladdr changes.
285 			 */
286 			ncec->ncec_rcnt = ill->ill_xmit_count;
287 			ncec->ncec_state = ND_PROBE;
288 			need_resolve = B_TRUE; /* reachable but very old nce */
289 		} else if (ncec->ncec_state == ND_INITIAL) {
290 			need_resolve = B_TRUE; /* ND_INITIAL nce */
291 			ncec->ncec_state = ND_INCOMPLETE;
292 		}
293 		/*
294 		 * NCE not known to be reachable in the recent past. We must
295 		 * reconfirm the information before returning it to the caller
296 		 */
297 		if (ncec->ncec_rcnt > 0) {
298 			/*
299 			 * Still resolving this ncec, so we can queue the
300 			 * callback information in ncec->ncec_cb
301 			 */
302 			ip2mid = ncec_add_cb(ncec, cb, cbarg);
303 			ip2m->ip2mac_err = EINPROGRESS;
304 		} else {
305 			/*
306 			 * No more retransmits allowed -- resolution failed.
307 			 */
308 			ip2m->ip2mac_err = ESRCH;
309 		}
310 	}
311 	mutex_exit(&ncec->ncec_lock);
312 done:
313 	/*
314 	 * if NCE_ISREACHABLE(ncec) but very old, or if it is ND_INITIAL,
315 	 * trigger resolve.
316 	 */
317 	if (need_resolve)
318 		ip_ndp_resolve(ncec);
319 	if (nce != NULL)
320 		nce_refrele(nce);
321 	netstack_rele(ns);
322 	ill_refrele(ill);
323 	return (ip2mid);
324 }
325 
/*
 * data passed to ncec_walk for canceling outstanding callbacks.
 */
typedef struct ip2mac_cancel_data_s {
	/* id of the callback to cancel, as supplied to ip2mac_cancel() */
	ip2mac_id_t ip2m_cancel_id;
	/*
	 * result of the cancel: initialized to 0 by ip2mac_cancel() and set
	 * to EBUSY by the walk callback when the entry cannot be removed
	 * because the callback list is being walked.
	 */
	int	ip2m_cancel_err;
} ip2mac_cancel_data_t;
333 
334 /*
335  * callback invoked for each active ncec. If the ip2mac_id_t corresponds
336  * to an active nce_cb_t in the ncec's callback list, we want to remove
337  * the callback (if there are no walkers) or return EBUSY to the caller
338  */
339 static int
340 ip2mac_cancel_callback(ncec_t *ncec, void *arg)
341 {
342 	ip2mac_cancel_data_t *ip2m_wdata = arg;
343 	ncec_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id;
344 	ncec_cb_t *ncec_cb;
345 
346 	if (ip2m_nce_cb->ncec_cb_id != ncec)
347 		return (0);
348 
349 	mutex_enter(&ncec->ncec_lock);
350 	if (list_is_empty(&ncec->ncec_cb)) {
351 		mutex_exit(&ncec->ncec_lock);
352 		return (0);
353 	}
354 	/*
355 	 * IP does not hold internal locks like nce_lock across calls to
356 	 * other subsystems for fear of recursive lock entry and lock
357 	 * hierarchy violation. The caller may be holding locks across
358 	 * the call to IP. (It would be ideal if no subsystem holds locks
359 	 * across calls into another subsystem, especially if calls can
360 	 * happen in either direction).
361 	 */
362 	ncec_cb = list_head(&ncec->ncec_cb);
363 	for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) {
364 		if (ncec_cb != ip2m_nce_cb)
365 			continue;
366 		/*
367 		 * If there are no walkers we can remove the nce_cb.
368 		 * Otherwise the exiting walker will clean up.
369 		 */
370 		if (ncec->ncec_cb_walker_cnt == 0) {
371 			list_remove(&ncec->ncec_cb, ncec_cb);
372 		} else {
373 			ip2m_wdata->ip2m_cancel_err = EBUSY;
374 		}
375 		break;
376 	}
377 	mutex_exit(&ncec->ncec_lock);
378 	return (0);
379 }
380 
381 /*
382  * cancel an outstanding timeout set up via ip2mac
383  */
384 int
385 ip2mac_cancel(ip2mac_id_t ip2mid, zoneid_t zoneid)
386 {
387 	netstack_t	*ns;
388 	ip_stack_t	*ipst;
389 	ip2mac_cancel_data_t ip2m_wdata;
390 
391 	ns = netstack_find_by_zoneid(zoneid);
392 	if (ns == NULL) {
393 		ip2m_wdata.ip2m_cancel_err = EINVAL;
394 		return (ip2m_wdata.ip2m_cancel_err);
395 	}
396 	/*
397 	 * For exclusive stacks we reset the zoneid to zero
398 	 * since IP uses the global zoneid in the exclusive stacks.
399 	 */
400 	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
401 		zoneid = GLOBAL_ZONEID;
402 	ipst = ns->netstack_ip;
403 
404 	ip2m_wdata.ip2m_cancel_id = ip2mid;
405 	ip2m_wdata.ip2m_cancel_err = 0;
406 	ncec_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst);
407 	/*
408 	 * We may return EBUSY if a walk to dispatch callbacks is
409 	 * in progress, in which case the caller needs to synchronize
410 	 * with the registered callback function to make sure the
411 	 * module does not exit when there is a callback pending.
412 	 */
413 	netstack_rele(ns);
414 	return (ip2m_wdata.ip2m_cancel_err);
415 }
416