1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 1997, 1998
5 * Sleepycat Software. All rights reserved.
6 */
7 #include "config.h"
8
9 #ifndef lint
10 static const char sccsid[] = "@(#)mp_fopen.c 10.60 (Sleepycat) 1/1/99";
11 #endif /* not lint */
12
13 #ifndef NO_SYSTEM_INCLUDES
14 #include <sys/types.h>
15
16 #include <errno.h>
17 #include <string.h>
18 #endif
19
20 #include "db_int.h"
21 #include "shqueue.h"
22 #include "db_shash.h"
23 #include "mp.h"
24 #include "common_ext.h"
25
26 static int __memp_mf_close __P((DB_MPOOL *, DB_MPOOLFILE *));
27 static int __memp_mf_open __P((DB_MPOOL *,
28 const char *, size_t, db_pgno_t, DB_MPOOL_FINFO *, MPOOLFILE **));
29
30 /*
31 * memp_fopen --
32 * Open a backing file for the memory pool.
33 */
34 int
memp_fopen(dbmp,path,flags,mode,pagesize,finfop,retp)35 memp_fopen(dbmp, path, flags, mode, pagesize, finfop, retp)
36 DB_MPOOL *dbmp;
37 const char *path;
38 u_int32_t flags;
39 int mode;
40 size_t pagesize;
41 DB_MPOOL_FINFO *finfop;
42 DB_MPOOLFILE **retp;
43 {
44 int ret;
45
46 MP_PANIC_CHECK(dbmp);
47
48 /* Validate arguments. */
49 if ((ret = __db_fchk(dbmp->dbenv,
50 "memp_fopen", flags, DB_CREATE | DB_NOMMAP | DB_RDONLY)) != 0)
51 return (ret);
52
53 /* Require a non-zero pagesize. */
54 if (pagesize == 0) {
55 __db_err(dbmp->dbenv, "memp_fopen: pagesize not specified");
56 return (EINVAL);
57 }
58 if (finfop != NULL && finfop->clear_len > pagesize)
59 return (EINVAL);
60
61 return (__memp_fopen(dbmp,
62 NULL, path, flags, mode, pagesize, 1, finfop, retp));
63 }
64
65 /*
66 * __memp_fopen --
67 * Open a backing file for the memory pool; internal version.
68 *
69 * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *,
70 * PUBLIC: u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **));
71 */
72 int
__memp_fopen(dbmp,mfp,path,flags,mode,pagesize,needlock,finfop,retp)73 __memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp)
74 DB_MPOOL *dbmp;
75 MPOOLFILE *mfp;
76 const char *path;
77 u_int32_t flags;
78 int mode, needlock;
79 size_t pagesize;
80 DB_MPOOL_FINFO *finfop;
81 DB_MPOOLFILE **retp;
82 {
83 DB_ENV *dbenv;
84 DB_MPOOLFILE *dbmfp;
85 DB_MPOOL_FINFO finfo;
86 db_pgno_t last_pgno;
87 size_t maxmap;
88 u_int32_t mbytes, bytes;
89 int ret;
90 u_int8_t idbuf[DB_FILE_ID_LEN];
91 char *rpath;
92
93 dbenv = dbmp->dbenv;
94 ret = 0;
95 rpath = NULL;
96
97 /*
98 * If mfp is provided, we take the DB_MPOOL_FINFO information from
99 * the mfp. We don't bother initializing everything, because some
100 * of them are expensive to acquire. If no mfp is provided and the
101 * finfop argument is NULL, we default the values.
102 */
103 if (finfop == NULL) {
104 memset(&finfo, 0, sizeof(finfo));
105 if (mfp != NULL) {
106 finfo.ftype = mfp->ftype;
107 finfo.pgcookie = NULL;
108 finfo.fileid = NULL;
109 finfo.lsn_offset = mfp->lsn_off;
110 finfo.clear_len = mfp->clear_len;
111 } else {
112 finfo.ftype = 0;
113 finfo.pgcookie = NULL;
114 finfo.fileid = NULL;
115 finfo.lsn_offset = -1;
116 finfo.clear_len = 0;
117 }
118 finfop = &finfo;
119 }
120
121 /* Allocate and initialize the per-process structure. */
122 if ((ret = __os_calloc(1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
123 return (ret);
124 dbmfp->dbmp = dbmp;
125 dbmfp->fd = -1;
126 dbmfp->ref = 1;
127 if (LF_ISSET(DB_RDONLY))
128 F_SET(dbmfp, MP_READONLY);
129
130 if (path == NULL) {
131 if (LF_ISSET(DB_RDONLY)) {
132 __db_err(dbenv,
133 "memp_fopen: temporary files can't be readonly");
134 ret = EINVAL;
135 goto err;
136 }
137 last_pgno = 0;
138 } else {
139 /* Get the real name for this file and open it. */
140 if ((ret = __db_appname(dbenv,
141 DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0)
142 goto err;
143 if ((ret = __db_open(rpath,
144 LF_ISSET(DB_CREATE | DB_RDONLY),
145 DB_CREATE | DB_RDONLY, mode, &dbmfp->fd)) != 0) {
146 __db_err(dbenv, "%s: %s", rpath, strerror(ret));
147 goto err;
148 }
149
150 /*
151 * Don't permit files that aren't a multiple of the pagesize,
152 * and find the number of the last page in the file, all the
153 * time being careful not to overflow 32 bits.
154 *
155 * !!!
156 * We can't use off_t's here, or in any code in the mainline
157 * library for that matter. (We have to use them in the os
158 * stubs, of course, as there are system calls that take them
159 * as arguments.) The reason is that some customers build in
160 * environments where an off_t is 32-bits, but still run where
161 * offsets are 64-bits, and they pay us a lot of money.
162 */
163 if ((ret = __os_ioinfo(rpath,
164 dbmfp->fd, &mbytes, &bytes, NULL)) != 0) {
165 __db_err(dbenv, "%s: %s", rpath, strerror(ret));
166 goto err;
167 }
168
169 /* Page sizes have to be a power-of-two, ignore mbytes. */
170 if (bytes % pagesize != 0) {
171 __db_err(dbenv,
172 "%s: file size not a multiple of the pagesize",
173 rpath);
174 ret = EINVAL;
175 goto err;
176 }
177
178 last_pgno = mbytes * (MEGABYTE / pagesize);
179 last_pgno += bytes / pagesize;
180
181 /* Correction: page numbers are zero-based, not 1-based. */
182 if (last_pgno != 0)
183 --last_pgno;
184
185 /*
186 * Get the file id if we weren't given one. Generated file id's
187 * don't use timestamps, otherwise there'd be no chance of any
188 * other process joining the party.
189 */
190 if (finfop->fileid == NULL) {
191 if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
192 goto err;
193 finfop->fileid = idbuf;
194 }
195 }
196
197 /*
198 * If we weren't provided an underlying shared object to join with,
199 * find/allocate the shared file objects. Also allocate space for
200 * for the per-process thread lock.
201 */
202 if (needlock)
203 LOCKREGION(dbmp);
204
205 if (mfp == NULL)
206 ret = __memp_mf_open(dbmp,
207 path, pagesize, last_pgno, finfop, &mfp);
208 else {
209 ++mfp->ref;
210 ret = 0;
211 }
212 if (ret == 0 &&
213 F_ISSET(dbmp, MP_LOCKHANDLE) && (ret =
214 __memp_alloc(dbmp, sizeof(db_mutex_t), NULL, &dbmfp->mutexp)) == 0)
215 LOCKINIT(dbmp, dbmfp->mutexp);
216
217 if (needlock)
218 UNLOCKREGION(dbmp);
219 if (ret != 0)
220 goto err;
221
222 dbmfp->mfp = mfp;
223
224 /*
225 * If a file:
226 * + is read-only
227 * + isn't temporary
228 * + doesn't require any pgin/pgout support
229 * + the DB_NOMMAP flag wasn't set
230 * + and is less than mp_mmapsize bytes in size
231 *
232 * we can mmap it instead of reading/writing buffers. Don't do error
233 * checking based on the mmap call failure. We want to do normal I/O
234 * on the file if the reason we failed was because the file was on an
235 * NFS mounted partition, and we can fail in buffer I/O just as easily
236 * as here.
237 *
238 * XXX
239 * We'd like to test to see if the file is too big to mmap. Since we
240 * don't know what size or type off_t's or size_t's are, or the largest
241 * unsigned integral type is, or what random insanity the local C
242 * compiler will perpetrate, doing the comparison in a portable way is
243 * flatly impossible. Hope that mmap fails if the file is too large.
244 */
245 #define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */
246 if (F_ISSET(mfp, MP_CAN_MMAP)) {
247 if (!F_ISSET(dbmfp, MP_READONLY))
248 F_CLR(mfp, MP_CAN_MMAP);
249 if (path == NULL)
250 F_CLR(mfp, MP_CAN_MMAP);
251 if (finfop->ftype != 0)
252 F_CLR(mfp, MP_CAN_MMAP);
253 if (LF_ISSET(DB_NOMMAP))
254 F_CLR(mfp, MP_CAN_MMAP);
255 maxmap = dbenv == NULL || dbenv->mp_mmapsize == 0 ?
256 DB_MAXMMAPSIZE : dbenv->mp_mmapsize;
257 if (mbytes > maxmap / MEGABYTE ||
258 (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE))
259 F_CLR(mfp, MP_CAN_MMAP);
260 }
261 dbmfp->addr = NULL;
262 if (F_ISSET(mfp, MP_CAN_MMAP)) {
263 dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
264 if (__db_mapfile(rpath,
265 dbmfp->fd, dbmfp->len, 1, &dbmfp->addr) != 0) {
266 dbmfp->addr = NULL;
267 F_CLR(mfp, MP_CAN_MMAP);
268 }
269 }
270 if (rpath != NULL)
271 __os_freestr(rpath);
272
273 LOCKHANDLE(dbmp, dbmp->mutexp);
274 TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
275 UNLOCKHANDLE(dbmp, dbmp->mutexp);
276
277 *retp = dbmfp;
278 return (0);
279
280 err: /*
281 * Note that we do not have to free the thread mutex, because we
282 * never get to here after we have successfully allocated it.
283 */
284 if (rpath != NULL)
285 __os_freestr(rpath);
286 if (dbmfp->fd != -1)
287 (void)__os_close(dbmfp->fd);
288 if (dbmfp != NULL)
289 __os_free(dbmfp, sizeof(DB_MPOOLFILE));
290 return (ret);
291 }
292
293 /*
294 * __memp_mf_open --
295 * Open an MPOOLFILE.
296 */
297 static int
__memp_mf_open(dbmp,path,pagesize,last_pgno,finfop,retp)298 __memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, retp)
299 DB_MPOOL *dbmp;
300 const char *path;
301 size_t pagesize;
302 db_pgno_t last_pgno;
303 DB_MPOOL_FINFO *finfop;
304 MPOOLFILE **retp;
305 {
306 MPOOLFILE *mfp;
307 int ret;
308 void *p;
309
310 #define ISTEMPORARY (path == NULL)
311
312 /*
313 * Walk the list of MPOOLFILE's, looking for a matching file.
314 * Temporary files can't match previous files.
315 */
316 if (!ISTEMPORARY)
317 for (mfp = SH_TAILQ_FIRST(&dbmp->mp->mpfq, __mpoolfile);
318 mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
319 if (F_ISSET(mfp, MP_TEMP))
320 continue;
321 if (!memcmp(finfop->fileid,
322 R_ADDR(dbmp, mfp->fileid_off), DB_FILE_ID_LEN)) {
323 if (finfop->clear_len != mfp->clear_len ||
324 finfop->ftype != mfp->ftype ||
325 pagesize != mfp->stat.st_pagesize) {
326 __db_err(dbmp->dbenv,
327 "%s: ftype, clear length or pagesize changed",
328 path);
329 return (EINVAL);
330 }
331
332 /* Found it: increment the reference count. */
333 ++mfp->ref;
334 *retp = mfp;
335 return (0);
336 }
337 }
338
339 /* Allocate a new MPOOLFILE. */
340 if ((ret = __memp_alloc(dbmp, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
341 return (ret);
342 *retp = mfp;
343
344 /* Initialize the structure. */
345 memset(mfp, 0, sizeof(MPOOLFILE));
346 mfp->ref = 1;
347 mfp->ftype = finfop->ftype;
348 mfp->lsn_off = finfop->lsn_offset;
349 mfp->clear_len = finfop->clear_len;
350
351 /*
352 * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget,
353 * we have to know the last page in the file. Figure it out and save
354 * it away.
355 */
356 mfp->stat.st_pagesize = pagesize;
357 mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
358
359 if (ISTEMPORARY)
360 F_SET(mfp, MP_TEMP);
361 else {
362 /* Copy the file path into shared memory. */
363 if ((ret = __memp_alloc(dbmp,
364 strlen(path) + 1, &mfp->path_off, &p)) != 0)
365 goto err;
366 memcpy(p, path, strlen(path) + 1);
367
368 /* Copy the file identification string into shared memory. */
369 if ((ret = __memp_alloc(dbmp,
370 DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
371 goto err;
372 memcpy(p, finfop->fileid, DB_FILE_ID_LEN);
373
374 F_SET(mfp, MP_CAN_MMAP);
375 }
376
377 /* Copy the page cookie into shared memory. */
378 if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) {
379 mfp->pgcookie_len = 0;
380 mfp->pgcookie_off = 0;
381 } else {
382 if ((ret = __memp_alloc(dbmp,
383 finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
384 goto err;
385 memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size);
386 mfp->pgcookie_len = finfop->pgcookie->size;
387 }
388
389 /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */
390 SH_TAILQ_INSERT_HEAD(&dbmp->mp->mpfq, mfp, q, __mpoolfile);
391
392 if (0) {
393 err: if (mfp->path_off != 0)
394 __db_shalloc_free(dbmp->addr,
395 R_ADDR(dbmp, mfp->path_off));
396 if (mfp->fileid_off != 0)
397 __db_shalloc_free(dbmp->addr,
398 R_ADDR(dbmp, mfp->fileid_off));
399 if (mfp != NULL)
400 __db_shalloc_free(dbmp->addr, mfp);
401 mfp = NULL;
402 }
403 return (0);
404 }
405
406 /*
407 * memp_fclose --
408 * Close a backing file for the memory pool.
409 */
410 int
memp_fclose(dbmfp)411 memp_fclose(dbmfp)
412 DB_MPOOLFILE *dbmfp;
413 {
414 DB_MPOOL *dbmp;
415 int ret, t_ret;
416
417 dbmp = dbmfp->dbmp;
418 ret = 0;
419
420 MP_PANIC_CHECK(dbmp);
421
422 for (;;) {
423 LOCKHANDLE(dbmp, dbmp->mutexp);
424
425 /*
426 * We have to reference count DB_MPOOLFILE structures as other
427 * threads may be using them. The problem only happens if the
428 * application makes a bad design choice. Here's the path:
429 *
430 * Thread A opens a database.
431 * Thread B uses thread A's DB_MPOOLFILE to write a buffer
432 * in order to free up memory in the mpool cache.
433 * Thread A closes the database while thread B is using the
434 * DB_MPOOLFILE structure.
435 *
436 * By opening all databases before creating the threads, and
437 * closing them after the threads have exited, applications
438 * get better performance and avoid the problem path entirely.
439 *
440 * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer
441 * is a short-term lock, even in worst case, since we better be
442 * the only thread of control using the DB_MPOOLFILE structure
443 * to read pages *into* the cache. Wait until we're the only
444 * reference holder and remove the DB_MPOOLFILE structure from
445 * the list, so nobody else can even find it.
446 */
447 if (dbmfp->ref == 1) {
448 TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
449 break;
450 }
451 UNLOCKHANDLE(dbmp, dbmp->mutexp);
452
453 (void)__os_sleep(1, 0);
454 }
455 UNLOCKHANDLE(dbmp, dbmp->mutexp);
456
457 /* Complain if pinned blocks never returned. */
458 if (dbmfp->pinref != 0)
459 __db_err(dbmp->dbenv, "%s: close: %lu blocks left pinned",
460 __memp_fn(dbmfp), (u_long)dbmfp->pinref);
461
462 /* Close the underlying MPOOLFILE. */
463 (void)__memp_mf_close(dbmp, dbmfp);
464
465 /* Discard any mmap information. */
466 if (dbmfp->addr != NULL &&
467 (ret = __db_unmapfile(dbmfp->addr, dbmfp->len)) != 0)
468 __db_err(dbmp->dbenv,
469 "%s: %s", __memp_fn(dbmfp), strerror(ret));
470
471 /* Close the file; temporary files may not yet have been created. */
472 if (dbmfp->fd != -1 && (t_ret = __os_close(dbmfp->fd)) != 0) {
473 __db_err(dbmp->dbenv,
474 "%s: %s", __memp_fn(dbmfp), strerror(t_ret));
475 if (ret != 0)
476 t_ret = ret;
477 }
478
479 /* Free memory. */
480 if (dbmfp->mutexp != NULL) {
481 LOCKREGION(dbmp);
482 __db_shalloc_free(dbmp->addr, dbmfp->mutexp);
483 UNLOCKREGION(dbmp);
484 }
485
486 /* Discard the DB_MPOOLFILE structure. */
487 __os_free(dbmfp, sizeof(DB_MPOOLFILE));
488
489 return (ret);
490 }
491
492 /*
493 * __memp_mf_close --
494 * Close down an MPOOLFILE.
495 */
496 static int
__memp_mf_close(dbmp,dbmfp)497 __memp_mf_close(dbmp, dbmfp)
498 DB_MPOOL *dbmp;
499 DB_MPOOLFILE *dbmfp;
500 {
501 BH *bhp, *nbhp;
502 MPOOL *mp;
503 MPOOLFILE *mfp;
504 size_t mf_offset;
505
506 mp = dbmp->mp;
507 mfp = dbmfp->mfp;
508
509 LOCKREGION(dbmp);
510
511 /* If more than a single reference, simply decrement. */
512 if (mfp->ref > 1) {
513 --mfp->ref;
514 goto ret1;
515 }
516
517 /*
518 * Move any BH's held by the file to the free list. We don't free the
519 * memory itself because we may be discarding the memory pool, and it's
520 * fairly expensive to reintegrate the buffers back into the region for
521 * no purpose.
522 */
523 mf_offset = R_OFFSET(dbmp, mfp);
524 for (bhp = SH_TAILQ_FIRST(&mp->bhq, __bh); bhp != NULL; bhp = nbhp) {
525 nbhp = SH_TAILQ_NEXT(bhp, q, __bh);
526
527 #ifdef DEBUG_NO_DIRTY
528 /* Complain if we find any blocks that were left dirty. */
529 if (F_ISSET(bhp, BH_DIRTY))
530 __db_err(dbmp->dbenv,
531 "%s: close: pgno %lu left dirty; ref %lu",
532 __memp_fn(dbmfp),
533 (u_long)bhp->pgno, (u_long)bhp->ref);
534 #endif
535
536 if (bhp->mf_offset == mf_offset) {
537 if (F_ISSET(bhp, BH_DIRTY)) {
538 ++mp->stat.st_page_clean;
539 --mp->stat.st_page_dirty;
540 }
541 __memp_bhfree(dbmp, mfp, bhp, 0);
542 SH_TAILQ_INSERT_HEAD(&mp->bhfq, bhp, q, __bh);
543 }
544 }
545
546 /* Delete from the list of MPOOLFILEs. */
547 SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile);
548
549 /* Free the space. */
550 if (mfp->path_off != 0)
551 __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->path_off));
552 if (mfp->fileid_off != 0)
553 __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->fileid_off));
554 if (mfp->pgcookie_off != 0)
555 __db_shalloc_free(dbmp->addr, R_ADDR(dbmp, mfp->pgcookie_off));
556 __db_shalloc_free(dbmp->addr, mfp);
557
558 ret1: UNLOCKREGION(dbmp);
559 return (0);
560 }
561