-/*
- * Copyright (c) 1996-1998, Index Data.
- * See the file LICENSE for details.
- * Heikki Levanto
- *
- * $Id: merge-d.c,v 1.21 1999-09-21 17:36:43 heikki Exp $
- *
- * bugs
- * (none)
- *
- * missing
- *
- * optimize
- * - Input filter: Eliminate del-ins pairs, tell if only one entry (or none)
- * - single-entry optimizing (keep the one entry in the dict, no block)
- * - study and optimize block sizes (later)
- * - find a way to decide the size of an empty diffblock (after merge)
- * - On allocating more blocks (in append and merge), check the order of
- * blocks, and if needed, swap them.
- * - Write a routine to save/load indexes into a block, save only as many
- * bytes as needed (size, diff, diffindexes)
- *
- *
- * caveat
- * There is a confusion about the block addresses. cat or type is the category,
- * pos or block is the block number. pp structures keep these two separate,
- * and combine when saving the pp. The next pointer in the pp structure is
- * also a combined address, but needs to be combined every time it is needed,
- * and separated when the partss are needed... This is done with the isamd_
- * _block, _type, and _addr macros. The _addr takes block and type as args,
- * in that order. This conflicts with the order these are often mentioned in
- * the debug log calls, and other places, leading to small mistakes here
- * and there.
- *
- * Needs cleaning! The way diff blocks are handled in append and reading is
- * quite different, and likely to give maintenance problems.
- *
- * log levels (set isamddebug=x in zebra.cfg (or what ever cfg file you use) )
- * 0 = no logging. Default
- * 1 = no logging here. isamd logs overall statistics
- * 2 = Each call to isamd_append with start address and no more
- * 3 = Start and type of append, start of merge, and result of append
- * 4 = Block allocations
- * 5 = Block-level operations (read/write)
- * 6 = Details about diff blocks etc.
- * 7 = Log each record as it passes the system (once)
- * 8 = Log raw and (de)coded data
- * 9 = Anything else that may be useful
- * .. = Anything needed to hunt a specific bug
- * (note that all tests in the code are like debug>3, which means 4 or above!)
- *
- * Design for the new and improved isamd
- * Key points:
- * - The first block is only diffs, no straight data
- * - Additional blocks are straight data
- * - When a diff block gets filled up, a data block is created by
- * merging the diffs with the data
- *
- * Structure
- * - Isamd_pp: buffer for diffs and for data
- * keep both pos, type, and combined address
- * routine to set the address
- * - diffbuf: lengths as short ints, or bytes for small blocks
- * - keys are of key_struct, not just a number of bytes.
- *
- * Routines
- * - isamd_append
- * - create_new_block if needed
- * - append_diffs
- * - load_diffs
- * - get diffend, start encoding
- * - while input data
- * - encode it
- * - if no room, then realloc block in larger size
- * - if still no room, merge and exit
- * - append in the block
- *
- * - merge
- * - just as before, except that merges also input data directly
- * - writes into new data blocks
- *
- *
- * - isamd.c: load firstpp, load datablock
- * save firstpp, save datablock
- * - Readlength, writelength - handling right size of len fields
- * - isamd_read_main_item: take also a merge input structure, and merge it too
- * - prefilter: cache two inputs, and check if they cancel.
- * - single-item optimization
- *
- * questions: Should we realloc firstblocks in a different size as the main
- * blocks. Makes a sideways seek, which is bound to be slowe. But saves some
- * update time. Compromise: alloc the first one in the size of the datablock,
- * but increase if necessary. Large blocks get a large diff, ok. Small ones
- * may get an extra seek in read, but save merges.
- */
+/* $Id: merge-d.c,v 1.30 2003-03-05 16:41:10 adam Exp $
+ Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002
+ Index Data Aps
+
+This file is part of the Zebra server.
+
+Zebra is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with Zebra; see the file LICENSE.zebra. If not, write to the
+Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.
+*/
+
+
#define NEW_ISAM_D 1 /* not yet ready to delete the old one! */
#include <assert.h>
#include <string.h>
#include <stdio.h>
-#include <log.h>
+#include <yaz/log.h>
#include "../index/index.h"
#include "isamd-p.h"
int difftype;
};
-#define DT_NONE 0 // no diff, marks end of sequence
-#define DT_DIFF 1 // ordinarry diff
-#define DT_MAIN 2 // main data
-#define DT_INPU 3 // input data to be merged
-#define DT_DONE 4 // done with all input here
+#define DT_NONE 0 /* no diff, marks end of sequence */
+#define DT_DIFF 1 /* ordinarry diff */
+#define DT_MAIN 2 /* main data */
+#define DT_INPU 3 /* input data to be merged */
+#define DT_DONE 4 /* done with all input here */
F->m1 = F->m2 = 0;
F->r1 = F->r2 = FILTER_NOTYET;
filter_fill(F);
+ return F;
}
static void filter_close (FILTER F)
}
F->r1 = FILTER_NOTYET;
return res;
-
-#ifdef SKIPTHIS
- char *k_ptr = (char*) k;
- int res = (F->data->read_item)(F->data->clientData, &k_ptr, mode);
- if (F->is->method->debug > 9)
- logf(LOG_LOG,"filt_read: start %d.%d m=%d r=%d",
- k->sysno, k->seqno, *mode, res);
- return res;
-#endif
}
-static int filter_empty(FILTER F)
+static int filter_isempty(FILTER F)
{
- return 0;
+ return ( (0 == F->r1) && (0 == F->r2)) ;
}
+#if 0
static int filter_only_one(FILTER F)
{
- return 0;
+ return ( (0 != F->r1) && (0 == F->r2));
}
+#endif
+/* We may need backfilling, if we read a lonely key to make */
+/* a singleton, but its bitw will not fit in. Then we need to */
+/* process it normally, which means reading it again. So we */
+/* need to unread it first. Luckily the filter is empty at that */
+/* point */
+#if 0
+static void filter_backfill(FILTER F, struct it_key *k, int mode)
+{
+ assert(F->r1 == FILTER_NOTYET ); /* not overwriting data! */
+ F->k1=*k;
+ F->m1=mode;
+ F->r1=1; /* ok read */
+}
+#endif
/***************************************************************
+ * Singleton encoding
+ ***************************************************************/
+/* When there is only a single item, we don't allocate a block
+ * for it, but code it in the directory entry directly, if it
+ * fits.
+ */
+
+#define DEC_SYSBITS 15
+#define DEC_SEQBITS 15
+#define DEC_MASK(n) ((1<<(n))-1)
+
+#define SINGLETON_BIT (1<<(DEC_SYSBITS+DEC_SEQBITS+1))
+
+int is_singleton(ISAMD_P ipos)
+{
+ return 0; /* no singletons any more */
+ return ( ipos != 0 ) && ( ipos & SINGLETON_BIT );
+}
+
+
+int singleton_encode(struct it_key *k)
+/* encodes the key into one int. If it does not fit, returns 0 */
+{
+ return 0; /* no more singletons */
+ if ( (k->sysno & DEC_MASK(DEC_SYSBITS) ) != k->sysno )
+ return 0; /* no room dor sysno */
+ if ( (k->seqno & DEC_MASK(DEC_SYSBITS) ) != k->seqno )
+ return 0; /* no room dor sysno */
+ return (k->sysno | (k->seqno << DEC_SYSBITS) ) | SINGLETON_BIT;
+}
+
+void singleton_decode (int code, struct it_key *k)
+{
+ assert (code & SINGLETON_BIT);
+ k->sysno = code & DEC_MASK(DEC_SYSBITS);
+ code = code >> DEC_SYSBITS;
+ k->seqno = code & DEC_MASK(DEC_SEQBITS);
+}
+
+
+/***************************************************************
* General support routines
***************************************************************/
if ( (pp->is->method->debug > 0) &&
(pp->diffinfo[i].maxidx > pp->is->method->filecat[pp->cat].bsize) )
- { /* bug-hunting, this fails on some long runs that log too much */
+ {
logf(LOG_LOG,"Bad MaxIx!!! %s:%d: diffidx=%d",
__FILE__,__LINE__, diffidx);
logf(LOG_LOG,"i=%d maxix=%d bsz=%d", i, pp->diffinfo[i].maxidx,
if (0==pp->diffinfo[i].maxidx)
{
- if (pp->is->method->debug > 5) //!!! 4
+ if (pp->is->method->debug > 5) /* !!! 4 */
logf(LOG_LOG,"isamd_getDiffInfo:End mark at ix=%d n=%d",
diffidx, i);
return; /* end marker */
; /* find last diff */
if (p_key)
{ /* we have an extra item to inject into the merge */
- if (pp->is->method->debug >9) //!!!!!
+ if (pp->is->method->debug >9) /* !!!!! */
logf(LOG_LOG,"isamd_read_item: going to merge with %d.%d",
p_key->sysno, p_key->seqno);
pp->diffinfo[i].key = *p_key; /* the key merge could not handle */
assert(winner==0); /* if nothing found, nothing comes from a diff */
cmp= 0; /* eof */
}
- if (pp->is->method->debug >9)
- logf(LOG_LOG,"mergeDB4: sysno[1]=%d", pp->diffinfo[1].key.sysno); /*!*/
+ if (cmp)
+ ++(pp->is->no_read_keys);
+ else
+ ++(pp->is->no_read_eof);
+
return cmp;
} /* isamd_read_item */
static int merge ( ISAMD_PP firstpp, /* first pp (with diffs) */
struct it_key *p_key, /* the data item that didn't fit*/
- /* ISAMD_I data) */ /* more input data comes here */
- FILTER filt) /* more input data arriving here */
+ FILTER filt, /* more input data arriving here */
+ char *dictentry, /* the thin in the dictionary */
+ int dictlen) /* and its size */
{
int diffidx;
int killblk=0;
int r_more = 1;
ISAMD_PP pp;
ISAMD_PP readpp=firstpp;
- int retval=0;
+ int retpos=0;
int diffcat = firstpp->cat; /* keep the category of the diffblock even */
/* if it is going to be empty now. */
/* Alternative: Make it the minimal, and */
/* resize later. Saves disk, but will lead */
/* into bad seeks. */
- ++(readpp->is->files[0].no_merges);
+ ++(readpp->is->no_merges);
/* set up diffs as they should be for reading */
diffidx = ISAMD_BLOCK_OFFSET_1;
- //readpp->diffbuf=readpp->buf; // diffinfo has to duplicate it!
- //getDiffInfo(readpp); // first read will make the diffinfo, at init
if (readpp->is->method->debug >4)
logf(LOG_LOG,"isamd_merge: f=%d=%d:%d n=%d=%d:%d",
r_ptr= (char *) &r_key;
-/* r_more = isamd_read_item_merge( readpp, &r_ptr, p_key, data); */
r_more = isamd_read_item_merge( readpp, &r_ptr, p_key, filt);
if (!r_more)
{ /* oops, all data has been deleted! what to do??? */
if (readpp->is->method->debug >5)
logf(LOG_LOG,"isamd_merge:all data has been deleted (nk=%d) ",
readpp->numKeys);
- //assert (readpp->numKeys == 0); /* no longer true! */
}
/* set up the new blocks for simple writing */
- firstpp=isamd_pp_open(readpp->is,isamd_addr(0, diffcat));
+ /* firstpp=isamd_pp_open(readpp->is,isamd_addr(0, diffcat)); */
+ firstpp=isamd_pp_create(readpp->is, diffcat);
firstpp->pos=isamd_alloc_block(firstpp->is,diffcat);
if (readpp->is->method->debug >3)
logf(LOG_LOG,"isamd_merge: allocated new firstpp %d=%d:%d",
isamd_addr(firstpp->pos,firstpp->cat), firstpp->cat, firstpp->pos );
- pp=isamd_pp_open(readpp->is,isamd_addr(0,readpp->is->max_cat) );
+ pp=isamd_pp_create(readpp->is,readpp->is->max_cat );
pp->offset=pp->size=ISAMD_BLOCK_OFFSET_N;
while (r_more)
} /* while read */
-// firstpp->diffs=0;
-
-
isamd_reduceblock(pp); /* reduce size if possible */
if (0==firstpp->next)
firstpp->next = isamd_addr(pp->pos,pp->cat);
firstpp->size = firstpp->offset = ISAMD_BLOCK_OFFSET_1; /* nothing there */
memset(firstpp->buf,'\0',firstpp->is->method->filecat[firstpp->cat].bsize);
save_first_pp(firstpp);
- retval = isamd_addr(firstpp->pos, firstpp->cat);
+ retpos = isamd_addr(firstpp->pos, firstpp->cat);
isamd_pp_close(firstpp);
- return retval;
+ /* Create the dict entry */
+ /*!*/ /* it could be this could go in the dict as well, if there's */
+ /* been really many deletes. Somehow I suspect that is not the */
+ /* case. FIXME: Collect statistics and see if needed */
+ dictentry[0]=0; /* mark as a real isam */
+ memcpy(dictentry+1, &retpos, sizeof(ISAMD_P));
+ dictlen=sizeof(ISAMD_P)+1;
+ return dictlen;
} /* merge */
static int append_diffs(
ISAMD is,
- ISAMD_P ipos,
- /*ISAMD_I data)*/
+ char *dictentry, int dictlen,
FILTER filt)
{
+ ISAMD_P ipos;
struct it_key i_key; /* one input item */
char *i_item = (char *) &i_key; /* same as chars */
char *i_ptr=i_item;
char *c_ptr = codebuff;
int codelen;
int merge_rc;
- int retval=0;
+ ISAMD_P retpos;
+ int dsize;
- if (0==ipos)
+ if (0==dictlen)
{
- firstpp=isamd_pp_open(is, isamd_addr(0,0) );
+ firstpp=isamd_pp_create(is, 0 );
firstpp->size=firstpp->offset=ISAMD_BLOCK_OFFSET_1;
/* create in smallest category, will expand later */
- ++(is->files[0].no_fbuilds);
+ ++(is->no_fbuilds);
}
else
{
- firstpp=isamd_pp_open(is, ipos);
- ++(is->files[0].no_appds);
+ firstpp=isamd_pp_open(is, dictentry, dictlen);
+ if (dictentry[0] )
+ ipos=0;
+ else
+ memcpy(&ipos,dictentry+1,sizeof(ISAMD_P));
+ ++(is->no_appds);
}
if (is->method->debug >2)
- logf(LOG_LOG,"isamd_appd: Start ipos=%d=%d:%d n=%d=%d:%d nk=%d",
+ logf(LOG_LOG,"isamd_appd: Start ipos=%d=%d:%d n=%d=%d:%d nk=%d sz=%d",
ipos, isamd_type(ipos), isamd_block(ipos),
firstpp->next, isamd_type(firstpp->next), isamd_block(firstpp->next),
- firstpp->numKeys);
+ firstpp->numKeys, firstpp->size);
maxsize = is->method->filecat[firstpp->cat].bsize;
difflenidx = diffidx = firstpp->size;
diffidx+=sizeof(int); /* difflen will be stored here */
/* read first input */
- //i_ptr = i_item; //!!!
i_more = filter_read(filt, &i_key, &i_mode);
/* i_more = (*data->read_item)(data->clientData, &i_ptr, &i_mode); */
if (diffidx + codelen > maxsize )
{ /* block full */
- if (firstpp->cat < firstpp->is->max_cat)
- { /* just increase the block size */
+ while ( (firstpp->cat < firstpp->is->max_cat) &&
+ (diffidx + codelen > maxsize) )
+ { /* try to increase the block size */
if (firstpp->pos > 0) /* free the old block if allocated */
isamd_release_block(is, firstpp->cat, firstpp->pos);
++firstpp->cat;
maxsize = is->method->filecat[firstpp->cat].bsize;
firstpp->pos=0; /* need to allocate it when saving */
if (is->method->debug >3)
- logf(LOG_LOG,"isamd_appd: increased diff block to %d (%d)",
+ logf(LOG_LOG,"isamd_appd: increased diff block sz to %d (%d)",
firstpp->cat, maxsize);
}
- else
- { /* max size already - can't help, need to merge it */
+ if ((firstpp->cat >= firstpp->is->max_cat) &&
+ (diffidx + codelen > maxsize) )
+ { /* max size - can't help, need to merge it */
if (is->method->debug >7)
- logf(LOG_LOG,"isamd_appd: block full");
- if (is->method->debug >9) //!!!!!
+ logf(LOG_LOG,"isamd_appd: need to merge");
+ if (is->method->debug >9) /* !!!!! */
logf(LOG_LOG,"isamd_appd: going to merge with m=%d %d.%d",
i_mode, i_key.sysno, i_key.seqno);
- merge_rc = merge (firstpp, &i_key, filt);
+ merge_rc = merge (firstpp, &i_key, filt, dictentry, dictlen);
if (0!=merge_rc)
return merge_rc; /* merge handled them all ! */
assert(!"merge returned zero ??");
} /* need to merge */
} /* block full */
-
+
+ if (!( diffidx+codelen <= maxsize ))
+ { /* bug hunting */
+ logf(LOG_LOG,"OOPS, diffidx problem: d=%d c=%d s=%d > m=%d",
+ diffidx, codelen, diffidx+codelen, maxsize);
+ logf(LOG_LOG,"ipos=%d f=%d=%d:%d",
+ ipos,
+ isamd_addr(firstpp->pos, firstpp->cat),
+ firstpp->cat, firstpp->pos );
+ }
assert ( diffidx+codelen <= maxsize );
/* save the diff */
while ( (difflenidx-diffidx<=sizeof(int)+1) && (difflenidx<maxsize))
firstpp->buf[difflenidx++]='\0';
- if (0==firstpp->pos) /* need to (re)alloc the block */
- firstpp->pos = isamd_alloc_block(is, firstpp->cat);
+ if (firstpp->numKeys==0)
+ {
+ /* FIXME: Release blocks that may be allocated !!! */
+ return 0; /* don't bother storing this! */
+ }
- retval = save_first_pp( firstpp );
- isamd_pp_close(firstpp);
+ dsize=diffidx-ISAMD_BLOCK_OFFSET_1;
+ /* logf(LOG_LOG,"!! nxt=%d diffidx=%d ds=%d",
+ firstpp->next, diffidx, dsize); */
+
+ if ( (0==firstpp->next) && (dsize <ISAMD_MAX_DICT_LEN))
+ {
+ /* logf(LOG_LOG,"building a dict entry!!"); */
+ assert(firstpp->numKeys < 128);
+ assert(firstpp->numKeys >0);
+ /* actually, 255 is good enough, but sign mismatches... */
+ /* in real life, 4-5 is as much as we can hope for, as long */
+ /* as ISAMD_MAX_DICT_LEN is reasonably small (8) */
+ dictentry[0]=firstpp->numKeys;
+ memcpy(dictentry+1, firstpp->buf+ISAMD_BLOCK_OFFSET_1, dsize);
+ dictlen=dsize+1;
+ }
+ else
+ {
+ if (0==firstpp->pos) /* need to (re)alloc the block */
+ firstpp->pos = isamd_alloc_block(is, firstpp->cat);
+ retpos = save_first_pp( firstpp );
+ isamd_pp_close(firstpp);
+ dictentry[0]=0; /* mark as a real isam */
+ memcpy(dictentry+1, &retpos, sizeof(ISAMD_P));
+ dictlen=sizeof(ISAMD_P)+1;
+ }
- return retval;
+ return dictlen;
} /* append_diffs */
* isamd_append itself
*************************************************************/
-ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data)
+int isamd_append (ISAMD is, char *dictentry, int dictlen, ISAMD_I data)
+/*ISAMD_P isamd_append (ISAMD is, ISAMD_P ipos, ISAMD_I data) */
{
FILTER F = filter_open(is,data);
- return append_diffs(is,ipos,F);
+ int newlen=0;
+
+ if ( filter_isempty(F) ) /* can be, if del-ins of the same */
+ {
+ if (is->method->debug >3)
+ logf(LOG_LOG,"isamd_appd: nothing to do ");
+ filter_close(F);
+ ++(is->no_non);
+ return dictlen; /* without doing anything at all */
+ }
+
+#ifdef SKIPTHIS
+ /* The old way to handle singletons */
+ if ( ( 0==ipos) && filter_only_one(F) )
+ {
+ struct it_key k;
+ int mode;
+ filter_read(F,&k,&mode);
+ assert(mode);
+ rc = singleton_encode(&k);
+ if (!rc)
+ {
+ if (is->method->debug >9)
+ logf(LOG_LOG,"isamd_appd: singleton didn't fit, backfilling");
+ filter_backfill(F,&k, mode);
+ }
+ if (is->method->debug >9)
+ logf(LOG_LOG,"isamd_appd: singleton %d (%x)",
+ rc,rc);
+ if (rc)
+ is->no_singles++;
+ assert ( (rc==0) || is_singleton(rc) );
+ }
+ newlen = append_diffs(is,ipos,F);
+#endif
+ newlen = append_diffs(is,dictentry,dictlen,F);
filter_close(F);
+
+ if (is->method->debug >2)
+ logf(LOG_LOG,"isamd_appd: ret len=%d ", newlen);
+ return newlen;
} /* isamd_append */
/*
* $Log: merge-d.c,v $
- * Revision 1.21 1999-09-21 17:36:43 heikki
+ * Revision 1.30 2003-03-05 16:41:10 adam
+ * Fix GCC warnings
+ *
+ * Revision 1.29 2002/11/26 22:18:34 adam
+ * Remove // comments
+ *
+ * Revision 1.28 2002/08/02 19:26:56 adam
+ * Towards GPL
+ *
+ * Revision 1.27 2002/07/12 18:12:21 heikki
+ * Isam-D now stores small entries directly in the dictionary.
+ * Needs more tuning and cleaning...
+ *
+ * Revision 1.26 2002/07/11 16:16:00 heikki
+ * Fixed a bug in isamd, failed to store a single key when its bits
+ * did not fit into a singleton.
+ *
+ * Revision 1.25 1999/11/30 13:48:04 adam
+ * Improved installation. Updated for inclusion of YAZ header files.
+ *
+ * Revision 1.24 1999/10/05 09:57:40 heikki
+ * Tuning the isam-d (and fixed a small "detail")
+ *
+ * Revision 1.23 1999/09/27 14:36:36 heikki
+ * singletons
+ *
+ * Revision 1.22 1999/09/23 18:01:18 heikki
+ * singleton optimising
+ *
+ * Revision 1.21 1999/09/21 17:36:43 heikki
* Added filter function. Not much of effect on the small test set...
*
* Revision 1.20 1999/09/20 15:48:06 heikki