xref: /titanic_51/usr/src/uts/common/avs/ns/sdbc/CACHE_SPEC.txt (revision 73a9f52fa0e4b5d00dc5f3a6314e6837e47f88cf)
1# CDDL HEADER START
2#
3# The contents of this file are subject to the terms of the
4# Common Development and Distribution License (the "License").
5# You may not use this file except in compliance with the License.
6#
7# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
8# or http://www.opensolaris.org/os/licensing.
9# See the License for the specific language governing permissions
10# and limitations under the License.
11#
12# When distributing Covered Code, include this CDDL HEADER in each
13# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
14# If applicable, add the following below this CDDL HEADER, with the
15# fields enclosed by brackets "[]" replaced with your own identifying
16# information: Portions Copyright [yyyy] [name of copyright owner]
17#
18# CDDL HEADER END
19#
20# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
21# Use is subject to license terms.
22#
23# $Id: CACHE_SPEC,v 3.6.0.0 1998/01/05 22:55:19 idumois Exp $
24#
25
26	"sd" cache layer
27	----------------
28#include <sys/sd/sd.h>
29
30The "sd" layer provides a common interface to the functionality
31described below.  It will also allow switching to a direct to disk
32version, so that a new cache module could be loaded.
33The functions are basically the same as those below,
34but named without the leading underscore.
35(ie sd_alloc_buf instead of _sd_alloc_buf)
36
37
38	"sdbc" -- storage device block cache (aka blkc)
39	-----------------------------------------------
40
41#include "uts/sd/sdbc/sd_cache.h"	/* for SDBC interface */
42#include "sys/sd/sd.h"			/* for generic SD interface */
43
44All interaction is in terms of the buf_handle.
45
46Currently buf_handle is declared as:
47
48#define _SD_MAX_BLKS	64
49#define _SD_MAX_FBAS	(_SD_MAX_BLKS << FBA_SHFT)
50
51typedef struct _sd_buf_handle {
52	int bh_cd;		/* actually bh_buf.sb_cd */
53	int bh_fba_pos;		/* bh_buf.sb_pos */
54	int bh_fba_len;		/* bh_buf.sb_len */
55	int bh_flag;		/* bh_buf.sb_flag */
56	int bh_error;		/* bh_buf.sb_error */
57	_sd_vec_t bh_bufvec[_SD_MAX_BLKS]; /* bh_buf.sb_vec */
58	void (*bh_disconnect_cb)();
59	void (*bh_read_cb)();
60	void (*bh_write_cb)();
61	......
62} _sd_buf_handle_t;
63
64
65typedef struct sd_vec_s {		/* Scatter gather element */
66	unsigned char	*sv_addr;	/* Virtual address of data */
67	unsigned int	sv_vme;		/* VME address of data */
68	int		sv_len;		/* Data length in bytes */
69} sd_vec_t;
70
71The upper level routines should reference only: handle->bh_error,
72handle->bh_bufvec.  The bh_bufvec is an array of _sd_vec_t with the
73last item in the array having a NULL buffer address (sv_addr).
74
75IMPORTANT: The handle should be treated read-only and never be modified.
76
77	1) Multiple accesses to a single file will be supported.
78	(Side effect: If a process owning cache blocks of a file attempts
79	to allocate overlapping cache blocks, it will be a
80	deadlock condition.)
81
82	2) Multiple writes to an allocated block will be supported. It
83	is no longer necessary to free and re-allocate between writes.
84
85	3) _SD_NOBLOCK is the equivalent of async_io -- the io will be initiated
86	if required with the call returning _SD_PENDING. A callback
87	(read or write) will be called at io end action.
88
89	4) Disconnect hints to ckd will be provided by the use of
90	either psema or thread_bind() when io needs to be initiated.
91
92
93NOTE:
94	fba_pos = disk block number, each block being 512 bytes.
95	fba_len = len in disk blocks, each block being 512 bytes.
96		Thus, 512 bytes = 1 fba_len, 1024 = 2 fba_len etc...
97
98Hints:
99	_SD_WRTHRU: write through mode.
100		This hint can be set on a node, a device or per access.
101	_SD_FORCED_WRTHRU: forced write through (node down or flow control)
102		If this hint is cleared,  when only one node is up,
103		_sd_uncommit() will not work properly, and a second
104		failure could result in lost data.
105		This is a node hint.
106	_SD_NOCACHE: reuse cache blocks immediately instead of keeping
107		in lru order.
108		This hint can be set on a device or per access.
109
110Interface:
111
112_sd_buf_handle_t *
113_sd_alloc_handle(discon_cb, read_cb, write_cb)
114	void (*discon_cb)();
115	void (*read_cb)();
116	void (*write_cb)();
117
118	The callbacks can be NULL if you do not want any callbacks.
119	Else, the callbacks will be stored in the handle, and will be
120	called at specific points in the cache. (It's up to the
121	callback to do what is necessary, including disconnecting
122	from the channel)
123
124	Usage: for better performance, an application could allocate
125	a handle (or as many handles as is required)  upfront and
126	use it later on in the cache calls.
127
128	Not allocating and managing the handles would mean a new
129	handle will be allocated and freed during _sd_alloc_buf
130	and _sd_free_buf.
131
132int
133_sd_free_handle(handle)
134	_sd_buf_handle_t *handle;
135
136	Only handles that are allocated through _sd_alloc_handle
137	should be freed with this call.
138
139int
140_sd_alloc_buf (cd, fba_pos, fba_len, flag, handle_p)
141	int cd;
142	int fba_pos;
143	int fba_len;
144	int flag;
145	_sd_buf_handle_t **handle_p;
146
147	cd = cache descriptor. Results in an error if this node does
148		not own this disk and the other node has not crashed.
149		(ie. requests must be routed to the correct node)
150		(see fault tolerant aspects discussed elsewhere)
151
152	fba_pos = disk position in multiples of 512 byte blocks.
153	fba_len = length in multiples of 512 byte blocks.
154		(NOTE: This cannot exceed _SD_MAX_FBAS)
155
156	flag = None, one or more of the following (described below):
157		_SD_RDBUF | _SD_WRBUF | _SD_RDWRBUF | _SD_PINNABLE |
158		_SD_NOBLOCK | _SD_NOCACHE | _SD_WRTHRU
159
160	handle_p = (*handle_p = handle to be used for this call)
161		If *handle_p == NULL, a new handle will be
162		allocated. _sd_free_buf will free up any handles
163                    allocated in this fashion.
164		NOTE: Handles allocated in this fashion will not have
165			any callbacks registered in them. As such,
166			_SD_NOBLOCK flag along with a NULL handle would
167			result in the io being lost.
168
169	return: Error number if > 0
170		possible errors:
171			EINVAL if arguments are incorrect or
172				cache not initialized or
173				device not open.
174			E2BIG if this request is a read and such a large
175			request cannot be currently satisfied. (break up
176			the io or re-issue at a later point)
177			EIO or any other errno that the driver might return.
178		Note: on error, the handle is not active,
179		and also is freed if *handle_p was NULL.
180
181	if 0 or less, status will be one of:
182	   _SD_DONE: buffer is ready, and ready to be used.
183		(with the blocks valid if _SD_RDBUF is set)
184	   _SD_PENDING:
185		read callback, if one has been registered in the handle,
186		will be called to complete this request.
187	   _SD_HIT:  Same as _SD_DONE, read was satisfied by cache,
188		or no blocking required for write buffer.
189
190	Note:	_SD_RDBUF will issue the read if necessary.
191		_SD_WRBUF allocates a network address to reflect to
192			mirror node on _sd_write().
193		~_SD_RDBUF allocates buffers but does NOT pre-read;
194			use _sd_read() to fill in (portions) as req'd.
195
196	Note:	flag == (_SD_RDBUF|_SD_WRTHRU|_SD_NOCACHE) will
197		clear valid bits (that are not dirty) thus read direct
198		from disk, without requiring a hash invalidate.
199
200
201int
202_sd_write (handle, fba_pos, fba_len, flag)
203	_sd_buf_handle_t *handle;
204	int fba_pos, fba_len;
205	int flag;
206
207	handle = handle previously allocated in allocate buf.
208          fba_pos and fba_len have to be within the allocated portion.
209	int flag. Flag: _SD_NOBLOCK | _SD_WRTHRU
210
211	Attempting to write to a handle that was not allocated for write
212	will return error (EINVAL)
213
214	returns:  errno if return > 0
215	if 0 or less, return  will be one of:
216	   _SD_PENDING: will be returned only if _SD_NOBLOCK is set AND
217		either the flag is _SD_WRTHRU or the other node is down,
218		or the device/node is in write through mode
219	   _SD_DONE: is returned if the block has been written to the disk.
220	   _SD_HIT: write block in cache.
221
222int
223_sd_read (handle, fba_pos, fba_len, flag)
224	_sd_buf_handle_t *handle;
225	int fba_pos, fba_len;
226	int flag;
227
228	handle = handle previously allocated in allocate buf.
229          fba_pos and fba_len have to be within the allocated portion.
230	int flag. Flag: _SD_NOBLOCK
231
232	returns:  errno if return > 0
233		error E2BIG if this request is big and cannot be currently
234		 satisfied. (break up the io or re-issue at a later point)
235
236	if 0 or less, return  will be one of:
237	   _SD_PENDING: will be returned only if _SD_NOBLOCK is set and
238		we need to do an io.
239	   _SD_HIT: is returned if the blocks were satisfied by cache.
240	   _SD_DONE: some blocks were read from disk.
241
242int
243_sd_uncommit(handle, fba_pos, fba_len, flag)
244	_sd_buf_handle_t *handle;
245	int fba_pos, fba_len;
246	int flag;
247
248	handle = handle previously allocated in allocate buf.
249          fba_pos and fba_len have to be within the allocated portion.
250	flag: reserved for future use.
251
252	_sd_uncommit could block and cannot be called from a
253		"non-blocking" context.
254	(This is under review, from the ckd point of view)
255
256	returns 0 (_SD_DONE) else errno;
257
258
259int
260_sd_zero (handle, fba_pos, fba_len, flag)
261	_sd_buf_handle_t *handle;
262	int fba_pos, fba_len;
263	int flag;
264
265	handle = handle previously allocated in allocate buf.
266          fba_pos and fba_len have to be within the allocated portion.
267	zero the buffer described by the handle.
268	flag: _SD_NOBLOCK | _SD_WRTHRU
269
270	The call commits data to disk.
271	This call has characteristics similar to _sd_write.
272
273	returns: errno if return > 0
274		if 0 or less, return will be one of:
275		_SD_DONE
276		_SD_PENDING
277
278_sd_copy (handle1, handle2, fba_pos1, fba_pos2, fba_len)
279	_sd_buf_handle_t *handle1, *handle2;
280	int fba_pos1, fba_pos2, fba_len;
281
282	Copies relevant data from handle1 to handle2.
283	Useful for mirroring, remote dual copy, backup while open,
284	in-house tests, etc.
285
286	This call does not commit data to disk - you must explicitly
287	call _sd_write() on handle2 if that is what you want.
288
289	returns: errno if return > 0:
290			 EIO - if sd module should do a generic bcopy
291			 others - real error (passed to user)
292		 if 0 or less, return will be:
293		 	_SD_DONE - success
294
295_sd_free_buf(handle)
296	_sd_buf_handle_t *handle;
297
298	handle = handle previously allocated in allocate buf.
299
300	returns 0 (_SD_DONE) else errno;
301
302_sd_open(filename, flag)
303	char *filename;
304	int flag;
305
306	returns a cache descriptor, or negative error number.
307	Typically use _sd_attach_cd(cd) before accessing the device.
308	Note: if the device is already open, it returns the same cache descriptor.
309	Currently there is no reference count; so one _sd_close() closes
310	the cache descriptor (in all contexts).
311
312_sd_close(cd)
313	int cd;
314	Similar to _sd_detach_cd below.
315	Note: intended to be called when terminating the cache; and not during
316	normal operation.  No reference count (see above).
317	Returns: 0 success, EIO.
318
319_sd_detach_cd(cd)
320	re-reflect any pinned blocks to the other side,
321	or wait for writes to flush; and invalidate that device's hash entries,
322	and relinquish device responsibility.
323	Returns: 0 success, EIO, EAGAIN.
324
325_sd_attach_cd(cd)
326	If device has pinned blocks then scan for and re-pin those blocks
327	(same idea as "node recovery" process, but per-device);
328	and assert device responsibility.
329
330_sd_notify_all_pin(cd)
331	rescan list of failed blocks and re-issue the pinned callback to
332	simulation.
333
334
335_sd_register_pinned(func)
336	void (*func)();
337    callback (*func)(cd, fba_pos, fba_len) when disk write fails,
338    and _SD_PENDING was specified on alloc.
339
340_sd_register_unpinned(func)
341	void (*func)();
342    callback (*func)(cd, fba_pos, fba_len) when data previously pinned
343    is successfully written to disk.
344
345_sd_register_down(func)
346	void (*func)();
347    callback (*func)() when health monitor detects the other node went down.
348
349_sd_set_hint(cd, hint)
350_sd_clear_hint(cd, hint)
351_sd_get_cd_hint(cd, &hint)
352_sd_set_node_hint(hint)
353_sd_clear_node_hint(hint)
354_sd_get_node_hint(&hint)
355
356    where hint is _SD_NOCACHE or _SD_WRTHRU. (Write through being synchronous
357	write and will be the default if the second node dies.)
358
359   _SD_NOCACHE: hint indicating that the current access need not be
360	cached for later consumption.
361
362
363_sd_discard_pinned(cd, fba_pos, fba_len)
364	call from ckd into cache, called when data that was pinned
365	earlier can be discarded from the cache.
366
367	returns: 0 or error.
368	(error = EINVAL if the discard could not be done)
369
370(note: there is an inherent race between the unpinned callback and
371_sd_discard_pinned which could put the data on disk in an inconsistent
372state)
373
374
375Failover support:
376
377The Nodedown callback will be called, if one has been registered. This
378will happen as soon as the other node has been detected to have gone down,
379or when the cache is disabled on the other node.
380
381The amount of time for this callback to happen after the node goes down
382is not deterministic.
383
384Access to a mirror node's devices is only valid from the point the
385nodedown callback is called till the other node is determined to be back
386in operation.
387
388Access to mirror node's devices while recovery is in progress will
389block the access till the recovery is complete.
390