2 * Copyright (C) 1994-1995, Index Data I/S
4 * Sebastian Hammer, Adam Dickmeiss
7 * Revision 1.31 1995-11-24 11:31:35 adam
8 * Commands add & del read filenames from stdin if source directory is
10 * Match criteria supports 'constant' strings.
12 * Revision 1.30 1995/11/22 17:19:16 adam
13 * Record management uses the bfile system.
15 * Revision 1.29 1995/11/21 15:01:14 adam
16 * New general match criteria implemented.
17 * New feature: document groups.
19 * Revision 1.28 1995/11/21 09:20:30 adam
20 * Yet more work on record match.
22 * Revision 1.27 1995/11/20 16:59:45 adam
23 * New update method: the 'old' keys are saved for each record.
25 * Revision 1.26 1995/11/20 11:56:24 adam
26 * Work on new traversal.
28 * Revision 1.25 1995/11/16 15:34:54 adam
29 * Uses new record management system in both indexer and server.
31 * Revision 1.24 1995/11/15 19:13:08 adam
32 * Work on record management.
34 * Revision 1.23 1995/10/27 14:00:10 adam
35 * Implemented detection of database availability.
37 * Revision 1.22 1995/10/17 18:02:07 adam
38 * New feature: databases. Implemented as prefix to words in dictionary.
40 * Revision 1.21 1995/10/10 12:24:38 adam
41 * Temporary sort files are compressed.
43 * Revision 1.20 1995/10/06 13:52:05 adam
44 * Bug fixes. Handler may abort further scanning.
46 * Revision 1.19 1995/10/04 12:55:16 adam
47 * Bug fix in ranked search. Use=Any keys inserted.
49 * Revision 1.18 1995/10/04 09:37:08 quinn
52 * Revision 1.17 1995/10/03 14:28:57 adam
53 * Buffered read in extract works.
55 * Revision 1.16 1995/10/03 14:28:45 adam
56 * Work on more efficient read handler in extract.
58 * Revision 1.15 1995/10/02 15:42:53 adam
59 * Extract uses file descriptors instead of FILE pointers.
61 * Revision 1.14 1995/10/02 15:29:13 adam
62 * More logging in file_extract.
64 * Revision 1.13 1995/09/29 14:01:39 adam
67 * Revision 1.12 1995/09/28 14:22:56 adam
68 * Sort uses smaller temporary files.
70 * Revision 1.11 1995/09/28 12:10:31 adam
71 * Bug fixes. Field prefix used in queries.
73 * Revision 1.10 1995/09/28 09:19:41 adam
74 * xfree/xmalloc used everywhere.
75 * Extract/retrieve method seems to work for text records.
77 * Revision 1.9 1995/09/27 12:22:28 adam
78 * More work on extract in record control.
79 * Field name is not in isam keys but in prefix in dictionary words.
81 * Revision 1.8 1995/09/14 07:48:22 adam
82 * Record control management.
84 * Revision 1.7 1995/09/11 13:09:32 adam
85 * More work on relevance feedback.
87 * Revision 1.6 1995/09/08 14:52:27 adam
88 * Minor changes. Dictionary is lower case now.
90 * Revision 1.5 1995/09/06 16:11:16 adam
91 * Option: only one word key per file.
93 * Revision 1.4 1995/09/05 15:28:39 adam
94 * More work on search engine.
96 * Revision 1.3 1995/09/04 12:33:41 adam
97 * Various cleanup. YAZ util used instead.
99 * Revision 1.2 1995/09/04 09:10:34 adam
100 * More work on index add/del/update.
101 * Merge sort implemented.
102 * Initial work on z39 server.
104 * Revision 1.1 1995/09/01 14:06:35 adam
105 * Split of work into more files.
114 #include <alexutil.h>
118 #include "recindex.h"
/* File-scope state shared by the routines below. */
/* Dictionary opened under the name MATCH_DICT in key_open. */
120 static Dict matchDict;
/* Record store handle; set from rec_open in key_open. */
122 static Records records = NULL;
/* One allocation used two ways: as raw bytes from the start (key data) and
 * as an array of char* whose upper slots point at those keys. */
124 static char **key_buf;
/* Number of char* slots that fit in key_buf (mem/sizeof(char*)). */
125 static size_t ptr_top;
/* Count of key-data bytes written from the start of key_buf. */
127 static size_t key_buf_used;
/* Section number used in log messages and in the temporary file name. */
128 static int key_file_no;
/* Counters reported through logf when the module shuts down. */
130 static int records_inserted = 0;
131 static int records_updated = 0;
132 static int records_deleted = 0;
/* Name passed to dict_open for the match dictionary. */
134 #define MATCH_DICT "match"
/* key_open: allocate the key buffer of `mem` bytes, then open the match
 * dictionary and the record store.
 * NOTE(review): several lines of this function are not visible in this
 * extract, including whatever follows the fatal dict_open log message. */
136 void key_open (int mem)
140 key_buf = xmalloc (mem);
/* the same allocation is later indexed as char*; this is its slot count */
141 ptr_top = mem/sizeof(char*);
147 if (!(matchDict = dict_open (MATCH_DICT, 20, 1)))
149 logf (LOG_FATAL, "dict_open fail of %s", MATCH_DICT);
153 records = rec_open (1);
/* encode_key_init: called by key_flush before the first key and whenever the
 * word changes.  The body is not visible in this extract; presumably it
 * resets the running sysno/seqno in *i that encode_key_write computes deltas
 * against -- TODO confirm. */
162 void encode_key_init (struct encode_info *i)
/* encode_key_int: store integer d at bp using a variable number of bytes.
 * In the visible branches the first byte carries a tag (128 or 192) added to
 * the highest-order bits of d, and the following bytes hold the lower bits,
 * most significant first.  The branches for smaller values and the final
 * byte of each visible branch are not shown in this extract; callers use the
 * return value as the advanced write pointer. */
168 char *encode_key_int (int d, char *bp)
/* values up to 4194303 (2^22 - 1): tag 128 */
177 else if (d <= 4194303)
179 *bp++ = 128 + (d>>16);
180 *bp++ = (d>>8) & 255;
/* larger values: tag 192 */
185 *bp++ = 192 + (d>>24);
186 *bp++ = (d>>16) & 255;
187 *bp++ = (d>>8) & 255;
/* encode_key_write: encode one key entry and write it to outf.
 *   k    - entry laid out as: NUL-terminated string, one command byte,
 *          then a struct it_key (the layout flushRecordKeys produces)
 *   i    - encoder state: output buffer plus the previous sysno/seqno
 *   outf - destination stream
 * sysno and seqno are written as differences from the previous entry. */
193 void encode_key_write (char *k, struct encode_info *i, FILE *outf)
/* copy the string including its terminating NUL; k ends just past the NUL */
198 while ((*bp++ = *k++))
/* *k is now the command byte; the key struct starts one byte later */
200 memcpy (&key, k+1, sizeof(struct it_key));
/* sysno delta is doubled and the command byte is added in */
201 bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);
202 if (i->sysno != key.sysno)
204 i->sysno = key.sysno;
207 bp = encode_key_int (key.seqno - i->seqno, bp);
208 i->seqno = key.seqno;
/* write everything accumulated in i->buf as one item */
209 if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
211 logf (LOG_FATAL|LOG_ERRNO, "fwrite");
/* key_flush: sort the buffered keys and write them, encoded, to a temporary
 * section file named from TEMP_FNAME and key_file_no.
 * NOTE(review): the loop structure, the updates of ptr_i and the fclose call
 * are not visible in this extract. */
216 void key_flush (void)
221 struct encode_info encode_info;
227 logf (LOG_LOG, "sorting section %d", key_file_no);
/* the ptr_i key pointers occupy the highest slots of key_buf */
228 qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_qsort_compare);
229 sprintf (out_fname, TEMP_FNAME, key_file_no);
231 if (!(outf = fopen (out_fname, "w")))
233 logf (LOG_FATAL|LOG_ERRNO, "fopen (4) %s", out_fname);
236 logf (LOG_LOG, "writing section %d", key_file_no);
/* first key: start a fresh encoder state and write the entry in full */
237 prevcp = cp = key_buf[ptr_top-ptr_i];
239 encode_key_init (&encode_info);
240 encode_key_write (cp, &encode_info, outf);
243 cp = key_buf[ptr_top-ptr_i];
/* string differs from the previous one: restart the encoder state */
244 if (strcmp (cp, prevcp))
246 encode_key_init (&encode_info);
247 encode_key_write (cp, &encode_info, outf);
/* pass a pointer to the terminating NUL so an empty string is written;
 * presumably the else-branch for a repeated string -- TODO confirm */
251 encode_key_write (cp + strlen(cp), &encode_info, outf);
255 logf (LOG_FATAL|LOG_ERRNO, "fclose %s", out_fname);
258 logf (LOG_LOG, "finished section %d", key_file_no);
/* Shutdown sequence: close the record store and the match dictionary, then
 * log the insert/update/delete counters.
 * NOTE(review): the header of the enclosing function is not visible in this
 * extract. */
267 rec_close (&records);
268 dict_close (matchDict);
270 logf (LOG_LOG, "Records inserted %6d", records_inserted);
271 logf (LOG_LOG, "Records updated %6d", records_updated);
272 logf (LOG_LOG, "Records deleted %6d", records_deleted);
/* wordInit: initialise a RecWord; installed as extractCtrl.init in
 * fileExtract.  Only the assignment of `which` is visible in this extract. */
276 static void wordInit (RecWord *p)
280 p->which = Word_String;
/* addRecordKey: append one word to the per-record key buffer `reckeys`.
 * Each entry is written as: attrSet, attrUse, the bytes of p->u.string, then
 * p->seqno.  Whatever is written on the lines not visible in this extract
 * (for example a string terminator) cannot be confirmed from here. */
289 static void addRecordKey (const RecWord *p)
/* grow by 65000 bytes whenever fewer than 1024 bytes remain */
296 if (reckeys.buf_used+1024 > reckeys.buf_max)
/* NOTE(review): the malloc result is used by the memcpy below without a
 * NULL check; elsewhere this file allocates with xmalloc. */
300 b = malloc (reckeys.buf_max += 65000);
301 if (reckeys.buf_used > 0)
302 memcpy (b, reckeys.buf, reckeys.buf_used);
306 dst = reckeys.buf + reckeys.buf_used;
/* copy through locals so the stored width is that of the local variables */
310 attrSet = p->attrSet;
311 memcpy (dst, &attrSet, sizeof(attrSet));
312 dst += sizeof(attrSet);
314 attrUse = p->attrUse;
315 memcpy (dst, &attrUse, sizeof(attrUse));
316 dst += sizeof(attrUse);
318 for (i = 0; p->u.string[i]; i++)
319 *dst++ = p->u.string[i];
322 memcpy (dst, &p->seqno, sizeof(p->seqno));
323 dst += sizeof(p->seqno);
329 reckeys.buf_used = dst - reckeys.buf;
/* flushRecordKeys: move every entry of a per-record key buffer into the
 * global key buffer key_buf.
 *   sysno        - record number (its use is on lines not visible here)
 *   cmd          - command byte stored after each word; callers pass 1 with
 *                  new keys and 0 with previously stored keys
 *   reckeys      - entries in the layout written by addRecordKey
 *   databaseName - passed to index_word_prefix for each word
 * Each output entry is: prefix, converted word characters, NUL, cmd byte,
 * struct it_key. */
332 static void flushRecordKeys (SYSNO sysno, int cmd, struct recKeys *reckeys,
333 const char *databaseName)
336 while (off < reckeys->buf_used)
338 const char *src = reckeys->buf + off;
343 memcpy (&attrSet, src, sizeof(attrSet));
344 src += sizeof(attrSet);
346 memcpy (&attrUse, src, sizeof(attrUse));
347 src += sizeof(attrUse);
/* fewer than 1024 bytes left between the key data and the pointer slots;
 * the action taken is not visible (presumably key_flush -- TODO confirm) */
349 if (key_buf_used + 1024 > (ptr_top-ptr_i)*sizeof(char*))
/* record where this entry starts in a pointer slot near the top */
352 key_buf[ptr_top-ptr_i] = (char*)key_buf + key_buf_used;
353 key_buf_used += index_word_prefix ((char*)key_buf + key_buf_used,
354 attrSet, attrUse, databaseName);
356 ((char*)key_buf) [key_buf_used++] = index_char_cvt (*src++);
358 ((char*)key_buf) [key_buf_used++] = '\0';
360 ((char*) key_buf)[key_buf_used++] = cmd;
362 memcpy (&key.seqno, src, sizeof(key.seqno));
363 src += sizeof(key.seqno);
365 memcpy ((char*)key_buf + key_buf_used, &key, sizeof(key));
366 key_buf_used += sizeof(key);
367 off = src - reckeys->buf;
/* the entries must account for the buffer exactly */
369 assert (off == reckeys->buf_used);
/* searchRecordKey: scan a per-record key buffer for entries whose attribute
 * set and use equal attrSetS/attrUseS.
 * Works on the function-static array ws of 32 entries, so it is not
 * reentrant; fileMatchStr uses the returned pointer as that 32-entry array.
 * NOTE(review): the store into ws and the handling of startSeq are not
 * visible in this extract. */
372 static const char **searchRecordKey (struct recKeys *reckeys,
373 int attrSetS, int attrUseS)
375 static const char *ws[32];
380 for (i = 0; i<32; i++)
383 while (off < reckeys->buf_used)
385 const char *src = reckeys->buf + off;
391 memcpy (&attrSet, src, sizeof(attrSet));
392 src += sizeof(attrSet);
394 memcpy (&attrUse, src, sizeof(attrUse));
395 src += sizeof(attrUse);
401 memcpy (&seqno, src, sizeof(seqno));
402 src += sizeof(seqno);
405 logf (LOG_LOG, "(%d,%d) %d %s", attrSet, attrUse, seqno, wstart);
407 if (attrUseS == attrUse && attrSetS == attrSet)
/* offset of this word relative to the starting sequence number */
414 woff = seqno - startSeq;
/* only offsets 0..30 are accepted, although ws has 32 entries */
415 if (woff >= 0 && woff < 31)
419 off = src - reckeys->buf;
421 assert (off == reckeys->buf_used);
/* addRecordKeyAny: installed as extractCtrl.add in fileExtract.  For words
 * whose attributes are not already (1,1016) a copy of the word is made in w;
 * the remaining lines are not visible here (presumably the copy is re-added
 * under (1,1016) and the original word is added as well -- TODO confirm). */
425 static void addRecordKeyAny (const RecWord *p)
427 if (p->attrSet != 1 || p->attrUse != 1016)
431 memcpy (&w, p, sizeof(w));
/* State of the buffered reader implemented by file_read_start/file_read. */
/* Read buffer; allocated with 4096 bytes in file_read_start. */
439 static char *file_buf;
/* Offset of the next unread byte in file_buf. */
440 static int file_offset;
/* Result of the most recent read() into file_buf. */
441 static int file_bufsize;
/* file_read_start: allocate the 4096-byte read buffer and fill it with the
 * first read from fd.
 * NOTE(review): the read() result is stored in file_bufsize as-is; no check
 * for -1 is visible in this extract. */
443 static void file_read_start (int fd)
446 file_buf = xmalloc (4096);
447 file_bufsize = read (fd, file_buf, 4096);
/* file_read_stop: called by fileExtract after extraction, before close(fd).
 * The body is not visible in this extract (presumably it releases
 * file_buf -- TODO confirm). */
450 static void file_read_stop (int fd)
/* file_read: buffered read handler, installed as extractCtrl.readf.
 * Serves `count` bytes into buf from file_buf, reading from fd when the
 * buffer holds too few bytes.
 * NOTE(review): the branch conditions and return statements are not visible
 * in this extract, so the exact return values cannot be stated here. */
455 static int file_read (int fd, char *buf, size_t count)
/* bytes still unread in the buffer */
457 int l = file_bufsize - file_offset;
/* hand over whatever the buffer still holds */
463 memcpy (buf, file_buf + file_offset, l);
/* request larger than one buffer fill: read straight into the caller's
 * buffer, after the l bytes already copied */
465 if (count > file_bufsize)
467 if ((r = read (fd, buf + l, count)) == -1)
469 logf (LOG_FATAL|LOG_ERRNO, "read");
/* otherwise refill the buffer and copy from it */
476 file_bufsize = r = read (fd, file_buf, 4096);
479 logf (LOG_FATAL|LOG_ERRNO, "read");
485 memcpy (buf + l, file_buf, r);
/* NOTE(review): whether count was already reduced by l before this point
 * is not visible; verify that count - l is the intended length. */
491 memcpy (buf + l, file_buf, count - l);
/* the whole request is served from the buffer */
495 memcpy (buf, file_buf + file_offset, count);
496 file_offset += count;
/* atois: accumulate the run of decimal digits at **s into val.
 * fileMatchStr relies on *s having moved past the digits afterwards; the
 * pointer advance and the return statement are not visible in this extract.
 * No overflow check is visible. */
500 static int atois (const char **s)
503 while ( (c=**s) >= '0' && c <= '9')
505 val = val*10 + c - '0';
/* fileMatchStr: build the match string for a record from the match
 * criteria `spec`.
 *   reckeys    - keys extracted from the record; searched for (set,use) terms
 *   rGroup     - record group; supplies $group and $database
 *   recordType - supplies $type
 *   spec       - criteria made of whitespace-separated terms:
 *                  (set,use)    words of the record with those attributes
 *                  $name        one of group, database, filename, type
 *                  "..." '...'  a constant string
 * The result is assembled in the function-static buffer dstBuf, so the
 * function is not reentrant and each call overwrites the previous result.
 * Syntax errors and missing fields are reported with logf at LOG_WARN.
 * NOTE(review): several lines (declarations, braces, return statements) are
 * not visible in this extract; no bound check on dstBuf is visible either.
 *
 * Fix: the $name comparisons tested strcmp() for non-zero, which selects
 * the branch when the names differ.  strcmp() returns 0 on equality, so
 * each test is now negated. */
511 static char *fileMatchStr (struct recKeys *reckeys, struct recordGroup *rGroup,
513 const char *recordType,
516 static char dstBuf[2048];
518 const char *s = spec;
519 static const char **w;
/* skip blanks between terms */
524 while (*s == ' ' || *s == '\t')
/* term of the form (attrSet,attrUse) */
531 int attrSet, attrUse;
535 attrSet = atois (&s);
538 logf (LOG_WARN, "Missing , in match criteria %s in group %s",
539 spec, rGroup->groupName ? rGroup->groupName : "none");
543 attrUse = atois (&s);
544 w = searchRecordKey (reckeys, attrSet, attrUse);
549 for (i = 0; i<32; i++)
554 logf (LOG_WARN, "Missing ) in match criteria %s in group %s",
555 spec, rGroup->groupName ? rGroup->groupName : "none");
/* use every flagged slot for which a word was found */
560 for (i = 0; i<32; i++)
561 if (matchFlag[i] && w[i])
573 logf (LOG_WARN, "Record in file %s didn't contain match"
574 " fields in (%d,%d)", fname, attrSet, attrUse);
/* term of the form $name: isolate the name up to the next blank */
582 const char *spec_src = NULL;
583 const char *s1 = ++s;
584 while (*s1 && *s1 != ' ' && *s1 != '\t')
590 memcpy (special, s, spec_len);
591 special[spec_len] = '\0';
/* strcmp() == 0 means the name matches */
594 if (!strcmp (special, "group"))
595 spec_src = rGroup->groupName;
596 else if (!strcmp (special, "database"))
597 spec_src = rGroup->databaseName;
598 else if (!strcmp (special, "filename"))
600 else if (!strcmp (special, "type"))
601 spec_src = recordType;
606 strcpy (dst, spec_src);
607 dst += strlen (spec_src);
/* constant string delimited by matching quote characters */
610 else if (*s == '\"' || *s == '\'')
612 int stopMarker = *s++;
616 while (*s && *s != stopMarker)
624 strcpy (dst, tmpString);
625 dst += strlen (tmpString);
629 logf (LOG_WARN, "Syntax error in match criteria %s in group %s",
630 spec, rGroup->groupName ? rGroup->groupName : "none");
637 logf (LOG_WARN, "No match criteria for record %s in group %s",
638 fname, rGroup->groupName ? rGroup->groupName : "none");
/* fileExtract: extract the keys of file `fname` and insert, update or
 * delete the corresponding record.
 *   sysno  - in/out record number; filled in from the match dictionary when
 *            a match string is found there
 *   fname  - path of the file to process
 *   rGroup - record group; databaseName is filled in here when it is unset
 * Configuration is read from common_resource under keys prefixed with
 * "<groupName>." when a group name is set.
 * NOTE(review): the remaining parameters, the branch conditions and the
 * return statements are not visible in this extract.  The sprintf/strcpy
 * calls into gprefix, ext and ext_res have no visible bound checks. */
644 int fileExtract (SYSNO *sysno, const char *fname, struct recordGroup *rGroup,
652 const char *file_type;
653 const char *file_match;
654 struct recExtractCtrl extractCtrl;
/* build the resource-key prefix from the group name, when there is one */
659 if (!rGroup->groupName || !*rGroup->groupName)
662 sprintf (gprefix, "%s.", rGroup->groupName);
664 logf (LOG_DEBUG, "fileExtract %s", fname);
666 /* determine file extension */
/* scan backwards from the end of the name for the last '.' */
667 for (i = strlen(fname); --i >= 0; )
673 else if (fname[i] == '.')
675 strcpy (ext, fname+i+1);
678 /* determine file type - depending on extension */
679 sprintf (ext_res, "%sfileExtension.%s", gprefix, ext);
680 if (!(file_type = res_get (common_resource, ext_res)))
682 if (!(recType = recType_byName (file_type)))
685 /* determine match criteria */
/* extension-specific key first, then the key without extension */
686 sprintf (ext_res, "%sfileMatch.%s", gprefix, ext);
687 file_match = res_get (common_resource, ext_res);
690 sprintf (ext_res, "%sfileMatch", gprefix);
691 file_match = res_get (common_resource, ext_res);
694 /* determine database name */
/* same lookup order; the result is stored in the caller's rGroup */
695 if (!rGroup->databaseName)
697 sprintf (ext_res, "%sdatabase.%s", gprefix, ext);
698 if (!(rGroup->databaseName = res_get (common_resource, ext_res)))
700 sprintf (ext_res, "%sdatabase", gprefix);
701 rGroup->databaseName = res_get (common_resource, ext_res);
704 if (!rGroup->databaseName)
705 rGroup->databaseName = "Default";
707 /* open input file */
708 if ((extractCtrl.fd = open (fname, O_RDONLY)) == -1)
710 logf (LOG_WARN|LOG_ERRNO, "open %s", fname);
/* set up the callbacks and run the record type's extract handler */
715 extractCtrl.subType = "";
716 extractCtrl.init = wordInit;
717 extractCtrl.add = addRecordKeyAny;
/* start with an empty per-record key buffer */
719 reckeys.buf_used = 0;
720 file_read_start (extractCtrl.fd);
721 extractCtrl.readf = file_read;
722 r = (*recType->extract)(&extractCtrl);
723 file_read_stop (extractCtrl.fd);
724 close (extractCtrl.fd);
728 logf (LOG_WARN, "Couldn't extract file %s, code %d", fname, r);
732 /* perform match if sysno not known and if match criteria is specified */
743 matchStr = fileMatchStr(&reckeys, rGroup, fname, file_type,
747 rinfo = dict_lookup (matchDict, matchStr);
/* the stored sysno starts one byte into the dictionary info */
749 memcpy (sysno, rinfo+1, sizeof(*sysno));
753 logf (LOG_WARN, "Record not inserted");
764 logf (LOG_LOG, "? record %s", fname);
/* new record: register the match string and flush the new keys (cmd 1) */
767 logf (LOG_LOG, "add record %s", fname);
768 rec = rec_new (records);
772 dict_insert (matchDict, matchStr, sizeof(*sysno), sysno);
773 flushRecordKeys (*sysno, 1, &reckeys, rGroup->databaseName);
/* existing record: flush the keys saved in the record (info[2]) with cmd 0,
 * using the database name saved with it (info[3]) */
779 struct recKeys delkeys;
781 rec = rec_get (records, *sysno);
783 delkeys.buf_used = rec->size[2];
784 delkeys.buf = rec->info[2];
785 flushRecordKeys (*sysno, 0, &delkeys, rec->info[3]);
788 logf (LOG_LOG, "delete record %s", fname);
790 rec_del (records, &rec);
795 logf (LOG_LOG, "update record %s", fname);
796 flushRecordKeys (*sysno, 1, &reckeys, rGroup->databaseName);
/* store in the record: [0] file type, [1] file name, [2] keys, [3] database */
801 rec->info[0] = rec_strdup (file_type, &rec->size[0]);
804 rec->info[1] = rec_strdup (fname, &rec->size[1]);
807 if (reckeys.buf_used > 0)
/* NOTE(review): the malloc result is passed to memcpy without a NULL check */
809 rec->info[2] = malloc (reckeys.buf_used);
810 rec->size[2] = reckeys.buf_used;
811 memcpy (rec->info[2], reckeys.buf, rec->size[2]);
819 rec->info[3] = rec_strdup (rGroup->databaseName, &rec->size[3]);
821 rec_put (records, &rec);