1 /* $Id: extract.c,v 1.149 2004-01-22 15:40:25 heikki Exp $
2 Copyright (C) 1995,1996,1997,1998,1999,2000,2001,2002,2003,2004
5 This file is part of the Zebra server.
7 Zebra is free software; you can redistribute it and/or modify it under
8 the terms of the GNU General Public License as published by the Free
9 Software Foundation; either version 2, or (at your option) any later
12 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
13 WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 You should have received a copy of the GNU General Public License
18 along with Zebra; see the file LICENSE.zebra. If not, write to the
19 Free Software Foundation, 59 Temple Place - Suite 330, Boston, MA
/* PRINTF_OFF_T is the printf conversion used throughout this file when
   logging off_t values; the choice depends on _FILE_OFFSET_BITS.
   NOTE(review): "%Ld" is not a standard C conversion for integers (the L
   length modifier is only defined for floating-point conversions); some
   C libraries accept it as a synonym for "%lld".  Confirm the supported
   platforms before relying on it. */
37 #if _FILE_OFFSET_BITS == 64
38 #define PRINTF_OFF_T "%Ld"
40 #define PRINTF_OFF_T "%ld"
/* Compile-time switch, defined as 0 here.
   NOTE(review): presumably enables shellsort() as a sort routine - confirm. */
43 #define USE_SHELLSORT 0
/* Sort r elements of s bytes each, starting at ar, into the order given
   by cmp, using a shell sort driven by a fixed table of 16 gap sizes. */
46 static void shellsort(void *ar, int r, size_t s,
47 int (*cmp)(const void *a, const void *b))
/* gap sizes, largest first, ending with 1 */
52 static const int incs[16] = { 1391376, 463792, 198768, 86961, 33936,
53 13776, 4592, 1968, 861, 336,
54 112, 48, 21, 7, 3, 1 };
55 for ( k = 0; k < 16; k++)
56 for (h = incs[k], i = h; i < r; i++)
/* NOTE(review): with "j > h" the element at offset j-h == 0 is never
   compared; "j >= h" looks intended - confirm before enabling this code */
60 while (j > h && (*cmp)(a + s*(j-h), v) > 0)
/* move the larger element up by one gap */
62 memcpy (a + s*j, a + s*(j-h), s);
/* Count one more processed record; on every 1000th record write a
   progress line with the running record counters to the log. */
70 static void logRecord (ZebraHandle zh)
72 ++zh->records_processed;
/* only report on multiples of 1000 */
73 if (!(zh->records_processed % 1000))
75 logf (LOG_LOG, "Records: %7d i/u/d %d/%d/%d",
76 zh->records_processed, zh->records_inserted, zh->records_updated,
/* Prepare the RecWord w for use with the extraction control block p:
   w takes over p's zebra_maps pointer and its attribute set defaults
   to VAL_BIB1. */
81 static void extract_init (struct recExtractCtrl *p, RecWord *w)
83 w->zebra_maps = p->zebra_maps;
85 w->attrSet = VAL_BIB1;
/* Scan the packed key buffer of reckeys for keys whose attribute set and
   use match attrSetS/attrUseS.  Matches are gathered in the function-static
   table ws (32 entries), so the result is only valid until the next call
   and the function is not reentrant.
   NOTE(review): ws is presumably indexed by sequence number relative to
   the first match (woff below) and returned to the caller - confirm. */
91 static const char **searchRecordKey (ZebraHandle zh,
92 struct recKeys *reckeys,
93 int attrSetS, int attrUseS)
95 static const char *ws[32];
107 for (i = 0; i<32; i++)
/* value zebraExplain_lookupSU() yields for the searched set/use pair */
111 chS = zebraExplain_lookupSU (zh->reg->zei, attrSetS, attrUseS);
/* walk the buffer one encoded key at a time */
115 while (off < reckeys->buf_used)
118 const char *src = reckeys->buf + off;
126 memcpy (&ch, src, sizeof(ch));
132 memcpy (&attrSet, src, sizeof(attrSet));
133 src += sizeof(attrSet);
137 memcpy (&attrUse, src, sizeof(attrUse));
138 src += sizeof(attrUse);
/* compact form: seqno delta kept in bits 2-5 of lead, biased by one */
145 seqno += ((lead>>2) & 15)-1;
/* full form: seqno stored verbatim in the buffer */
148 memcpy (&seqno, src, sizeof(seqno));
149 src += sizeof(seqno);
155 attrUseS == attrUse && attrSetS == attrSet
164 woff = seqno - startSeq;
/* NOTE(review): ws has 32 entries but only offsets 0..30 pass this test */
165 if (woff >= 0 && woff < 31)
169 off = src - reckeys->buf;
/* decoding must end exactly at the end of the used part of the buffer */
171 assert (off == reckeys->buf_used);
/* Read state shared by the file_* callbacks below (file_seek, file_tell,
   file_read, file_begin, file_end). */
175 struct file_read_info {
176 off_t file_max; /* maximum offset so far */
177 off_t file_offset; /* current offset */
178 off_t file_moffset; /* offset of rec/rec boundary */
/* Allocate (with xmalloc) a file_read_info for reading from fd; the
   record boundary offset starts at 0. */
185 static struct file_read_info *file_read_start (int fd)
187 struct file_read_info *fi = (struct file_read_info *)
188 xmalloc (sizeof(*fi));
192 fi->file_moffset = 0;
/* Counterpart of file_read_start().
   NOTE(review): presumably releases fi - confirm against the body. */
198 static void file_read_stop (struct file_read_info *fi)
/* Seek callback (installed as extractCtrl.seekf): remember offset as the
   current logical position and reposition the descriptor with lseek(),
   returning lseek()'s result. */
203 static off_t file_seek (void *handle, off_t offset)
205 struct file_read_info *p = (struct file_read_info *) handle;
206 p->file_offset = offset;
209 return lseek (p->fd, offset, SEEK_SET);
/* Tell callback (installed as extractCtrl.tellf): return the current
   logical offset kept in the read state. */
212 static off_t file_tell (void *handle)
214 struct file_read_info *p = (struct file_read_info *) handle;
215 return p->file_offset;
/* Read callback (installed as extractCtrl.readf): fetch up to count bytes
   into buf, either from the in-memory buffer sdrbuf or from the file
   descriptor, and keep track of the highest offset reached. */
218 static int file_read (void *handle, char *buf, size_t count)
220 struct file_read_info *p = (struct file_read_info *) handle;
/* in-memory source: never hand out more than remains before sdrmax */
226 if (r > p->sdrmax - p->file_offset)
227 r = p->sdrmax - p->file_offset;
229 memcpy (buf, p->sdrbuf + p->file_offset, r);
/* descriptor source */
232 r = read (fd, buf, count);
/* file_max is the high-water mark of file_offset */
236 if (p->file_offset > p->file_max)
237 p->file_max = p->file_offset;
/* Position the reader at the last record boundary (file_moffset).  When
   reading from a descriptor (no sdrbuf) and the boundary is non-zero,
   the descriptor itself is repositioned too. */
242 static void file_begin (void *handle)
244 struct file_read_info *p = (struct file_read_info *) handle;
246 p->file_offset = p->file_moffset;
247 if (!p->sdrbuf && p->file_moffset)
248 lseek (p->fd, p->file_moffset, SEEK_SET);
/* End-of-record callback (installed as extractCtrl.endf): store offset as
   the new record boundary.  file_more must still be 0 when this is
   called (asserted). */
252 static void file_end (void *handle, off_t offset)
254 struct file_read_info *p = (struct file_read_info *) handle;
256 assert (p->file_more == 0);
258 p->file_moffset = offset;
/* Build the match string for a record from the match specification spec.
   The specification is parsed item by item; the items handled here are
   "(attset,attname)" key references resolved through searchRecordKey(),
   named specials (group, database, filename, type) and quoted literals.
   The result is assembled in the function-static buffer dstBuf, so it is
   only valid until the next call and the function is not reentrant.
   fname is used for the "filename" special and in warnings.
   NOTE(review): failure presumably yields NULL after a logged warning -
   confirm against the return statements. */
261 static char *fileMatchStr (ZebraHandle zh,
262 struct recKeys *reckeys,
263 const char *fname, const char *spec)
265 static char dstBuf[2048]; /* static here ??? */
267 const char *s = spec;
268 static const char **w;
/* skip blanks between items */
272 while (*s == ' ' || *s == '\t')
278 char attset_str[64], attname_str[64];
279 data1_attset *attset;
282 int attSet = 1, attUse = 1;
/* attribute set name: everything up to ',' or ')' */
286 for (i = 0; *s && *s != ',' && *s != ')'; s++)
288 attset_str[i++] = *s;
289 attset_str[i] = '\0';
/* attribute name: everything up to ')' */
294 for (i = 0; *s && *s != ')'; s++)
296 attname_str[i++] = *s;
297 attname_str[i] = '\0';
300 if ((attset = data1_get_attset (zh->reg->dh, attset_str)))
303 attSet = attset->reference;
304 att = data1_getattbyname(zh->reg->dh, attset, attname_str);
/* numeric form of the attribute name */
308 attUse = atoi (attname_str);
/* fetch the record's keys for this set/use pair */
310 w = searchRecordKey (zh, reckeys, attSet, attUse);
315 for (i = 0; i<32; i++)
320 logf (LOG_WARN, "Missing ) in match criteria %s in group %s",
321 spec, zh->m_group ? zh->m_group : "none");
326 for (i = 0; i<32; i++)
327 if (matchFlag[i] && w[i])
339 logf (LOG_WARN, "Record didn't contain match"
340 " fields in (%s,%s)", attset_str, attname_str);
348 const char *spec_src = NULL;
349 const char *s1 = ++s;
/* the special's name runs up to the next blank */
350 while (*s1 && *s1 != ' ' && *s1 != '\t')
356 memcpy (special, s, spec_len);
357 special[spec_len] = '\0';
/* map the special's name to the string it stands for */
360 if (!strcmp (special, "group"))
361 spec_src = zh->m_group;
362 else if (!strcmp (special, "database"))
363 spec_src = zh->basenames[0];
364 else if (!strcmp (special, "filename")) {
367 else if (!strcmp (special, "type"))
368 spec_src = zh->m_record_type;
/* NOTE(review): unbounded copy into the fixed-size dstBuf; no length
   check is visible here - confirm */
373 strcpy (dst, spec_src);
374 dst += strlen (spec_src);
/* quoted literal: copied up to the matching quote character */
377 else if (*s == '\"' || *s == '\'')
379 int stopMarker = *s++;
383 while (*s && *s != stopMarker)
386 tmpString[i++] = *s++;
/* NOTE(review): unbounded copy into the fixed-size dstBuf - confirm */
391 strcpy (dst, tmpString);
392 dst += strlen (tmpString);
396 logf (LOG_WARN, "Syntax error in match criteria %s in group %s",
397 spec, zh->m_group ? zh->m_group : "none");
404 logf (LOG_WARN, "No match criteria for record %s in group %s",
405 fname, zh->m_group ? zh->m_group : "none");
/* NOTE(review): presumably carries context for log messages about the
   record being processed; no use of this struct is visible nearby. */
412 struct recordLogInfo {
415 struct recordGroup *rGroup;
/* Extract one record from the input described by fi and apply it to the
   register.  The record-type filter is run to collect index and sort
   keys; if no sysno is known the record is looked up in the match
   dictionary using the match criteria; the record is then added, updated
   or deleted and its bookkeeping fields (file type, filename, delete
   keys, sort keys, size, stored data, database name, offset) are
   refreshed before it is committed with rec_put().
   fname is used for log messages and stored with the record. */
418 static int file_extract_record(ZebraHandle zh,
419 SYSNO *sysno, const char *fname,
421 struct file_read_info *fi,
424 RecordAttr *recordAttr;
426 const char *matchStr;
429 off_t recordOffset = 0;
/* resolve the filter for the configured record type */
435 recType_byName (zh->reg->recTypes, zh->m_record_type, subType,
438 logf (LOG_WARN, "No such record type: %s", zh->m_record_type);
442 /* announce database */
443 if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
445 if (zebraExplain_newDatabase (zh->reg->zei, zh->basenames[0],
446 zh->m_explain_database))
452 struct recExtractCtrl extractCtrl;
454 /* we are going to read from a file, so prepare the extraction */
/* start with empty key and sort-key buffers */
457 zh->reg->keys.buf_used = 0;
458 zh->reg->keys.prevAttrUse = -1;
459 zh->reg->keys.prevAttrSet = -1;
460 zh->reg->keys.prevSeqNo = 0;
461 zh->reg->sortKeys.buf_used = 0;
/* the record starts at the current record boundary */
463 recordOffset = fi->file_moffset;
464 extractCtrl.offset = fi->file_moffset;
465 extractCtrl.readf = file_read;
466 extractCtrl.seekf = file_seek;
467 extractCtrl.tellf = file_tell;
468 extractCtrl.endf = file_end;
470 extractCtrl.subType = subType;
471 extractCtrl.init = extract_init;
472 extractCtrl.tokenAdd = extract_token_add;
473 extractCtrl.schemaAdd = extract_schema_add;
474 extractCtrl.dh = zh->reg->dh;
475 extractCtrl.handle = zh;
/* sequence numbers start at 1 for positioned register types, 0 otherwise */
476 for (i = 0; i<256; i++)
478 if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))
479 extractCtrl.seqno[i] = 1;
481 extractCtrl.seqno[i] = 0;
483 extractCtrl.zebra_maps = zh->reg->zebra_maps;
484 extractCtrl.flagShowRecords = !zh->m_flag_rw;
487 printf ("File: %s " PRINTF_OFF_T "\n", fname, recordOffset);
/* tag log lines written during extraction with file name and offset */
491 sprintf (msg, "%s:" PRINTF_OFF_T , fname, recordOffset);
492 yaz_log_init_prefix2 (msg);
/* run the record-type filter */
495 r = (*recType->extract)(clientData, &extractCtrl);
497 yaz_log_init_prefix2 (0);
498 if (r == RECCTRL_EXTRACT_EOF)
500 else if (r == RECCTRL_EXTRACT_ERROR_GENERIC)
502 /* error occurred during extraction ... */
504 zh->records_processed < zh->m_file_verbose_limit)
506 logf (LOG_WARN, "fail %s %s " PRINTF_OFF_T, zh->m_record_type,
507 fname, recordOffset);
511 else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER)
513 /* error occurred during extraction ... */
515 zh->records_processed < zh->m_file_verbose_limit)
517 logf (LOG_WARN, "no filter for %s %s "
518 PRINTF_OFF_T, zh->m_record_type,
519 fname, recordOffset);
523 if (zh->reg->keys.buf_used == 0)
525 /* the extraction process returned no information - the record
526 is probably empty - unless flagShowRecords is in use */
530 logf (LOG_WARN, "empty %s %s " PRINTF_OFF_T, zh->m_record_type,
531 fname, recordOffset);
536 /* perform match if sysno not known and if match criteria is specified */
543 if (zh->m_record_id && *zh->m_record_id)
547 matchStr = fileMatchStr (zh, &zh->reg->keys, fname,
/* a hit in the match dictionary yields the sysno of the existing record */
551 rinfo = dict_lookup (zh->reg->matchDict, matchStr);
553 memcpy (sysno, rinfo+1, sizeof(*sysno));
557 logf (LOG_WARN, "Bad match criteria");
568 logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T, zh->m_record_type,
569 fname, recordOffset);
570 logf (LOG_WARN, "cannot delete record above (seems new)");
573 if (zh->records_processed < zh->m_file_verbose_limit)
574 logf (LOG_LOG, "add %s %s " PRINTF_OFF_T, zh->m_record_type,
575 fname, recordOffset);
/* new record: create it, register its match string, flush its keys */
576 rec = rec_new (zh->reg->records);
580 recordAttr = rec_init_attr (zh->reg->zei, rec);
584 dict_insert (zh->reg->matchDict, matchStr, sizeof(*sysno), sysno);
586 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
587 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
589 zh->records_inserted++;
593 /* record already exists */
594 struct recKeys delkeys;
595 struct sortKeys sortKeys;
597 rec = rec_get (zh->reg->records, *sysno);
600 recordAttr = rec_init_attr (zh->reg->zei, rec);
/* unless forced, a record already stamped with the current run number
   is reported as skipped */
602 if (!force_update && recordAttr->runNumber ==
603 zebraExplain_runNumberIncrement (zh->reg->zei, 0))
605 yaz_log (LOG_LOG, "run number = %d", recordAttr->runNumber);
606 yaz_log (LOG_LOG, "skipped %s %s " PRINTF_OFF_T,
607 zh->m_record_type, fname, recordOffset);
608 extract_flushSortKeys (zh, *sysno, -1, &zh->reg->sortKeys);
/* the keys stored with the existing record are flushed with cmd 0 */
613 delkeys.buf_used = rec->size[recInfo_delKeys];
614 delkeys.buf = rec->info[recInfo_delKeys];
616 sortKeys.buf_used = rec->size[recInfo_sortKeys];
617 sortKeys.buf = rec->info[recInfo_sortKeys];
619 extract_flushSortKeys (zh, *sysno, 0, &sortKeys);
620 extract_flushRecordKeys (zh, *sysno, 0, &delkeys);
623 /* record going to be deleted */
624 if (!delkeys.buf_used)
626 logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,
627 zh->m_record_type, fname, recordOffset);
628 logf (LOG_WARN, "cannot delete file above, storeKeys false");
632 if (zh->records_processed < zh->m_file_verbose_limit)
633 logf (LOG_LOG, "delete %s %s " PRINTF_OFF_T,
634 zh->m_record_type, fname, recordOffset);
635 zh->records_deleted++;
637 dict_delete (zh->reg->matchDict, matchStr);
638 rec_del (zh->reg->records, &rec);
646 /* record going to be updated */
647 if (!delkeys.buf_used)
649 logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,
650 zh->m_record_type, fname, recordOffset);
651 logf (LOG_WARN, "cannot update file above, storeKeys false");
655 if (zh->records_processed < zh->m_file_verbose_limit)
656 logf (LOG_LOG, "update %s %s " PRINTF_OFF_T,
657 zh->m_record_type, fname, recordOffset);
658 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
659 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
660 zh->records_updated++;
664 /* update file type */
665 xfree (rec->info[recInfo_fileType]);
666 rec->info[recInfo_fileType] =
667 rec_strdup (zh->m_record_type, &rec->size[recInfo_fileType]);
669 /* update filename */
670 xfree (rec->info[recInfo_filename]);
671 rec->info[recInfo_filename] =
672 rec_strdup (fname, &rec->size[recInfo_filename]);
674 /* update delete keys */
675 xfree (rec->info[recInfo_delKeys]);
676 if (zh->reg->keys.buf_used > 0 && zh->m_store_keys == 1)
/* the key buffer is handed over to the record; the register's pointer
   and capacity are reset */
678 rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;
679 rec->info[recInfo_delKeys] = zh->reg->keys.buf;
680 zh->reg->keys.buf = NULL;
681 zh->reg->keys.buf_max = 0;
685 rec->info[recInfo_delKeys] = NULL;
686 rec->size[recInfo_delKeys] = 0;
689 /* update sort keys */
690 xfree (rec->info[recInfo_sortKeys]);
/* same hand-over for the sort key buffer */
692 rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;
693 rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;
694 zh->reg->sortKeys.buf = NULL;
695 zh->reg->sortKeys.buf_max = 0;
697 /* save file size of original record */
698 zebraExplain_recordBytesIncrement (zh->reg->zei,
699 - recordAttr->recordSize);
/* size is the distance to the new record boundary; if the boundary did
   not move, the highest offset read is used instead */
700 recordAttr->recordSize = fi->file_moffset - recordOffset;
701 if (!recordAttr->recordSize)
702 recordAttr->recordSize = fi->file_max - recordOffset;
703 zebraExplain_recordBytesIncrement (zh->reg->zei,
704 recordAttr->recordSize);
706 /* set run-number for this record */
707 recordAttr->runNumber = zebraExplain_runNumberIncrement (zh->reg->zei,
710 /* update store data */
711 xfree (rec->info[recInfo_storeData]);
712 if (zh->m_store_data)
/* re-read the raw record bytes from the file into the record */
714 rec->size[recInfo_storeData] = recordAttr->recordSize;
715 rec->info[recInfo_storeData] = (char *)
716 xmalloc (recordAttr->recordSize);
717 if (lseek (fi->fd, recordOffset, SEEK_SET) < 0)
719 logf (LOG_ERRNO|LOG_FATAL, "seek to " PRINTF_OFF_T " in %s",
720 recordOffset, fname);
/* a short read is treated as an error */
723 if (read (fi->fd, rec->info[recInfo_storeData], recordAttr->recordSize)
724 < recordAttr->recordSize)
726 logf (LOG_ERRNO|LOG_FATAL, "read %d bytes of %s",
727 recordAttr->recordSize, fname);
733 rec->info[recInfo_storeData] = NULL;
734 rec->size[recInfo_storeData] = 0;
736 /* update database name */
737 xfree (rec->info[recInfo_databaseName]);
738 rec->info[recInfo_databaseName] =
739 rec_strdup (zh->basenames[0], &rec->size[recInfo_databaseName]);
742 recordAttr->recordOffset = recordOffset;
744 /* commit this record */
745 rec_put (zh->reg->records, &rec);
/* Process the file fname.  When not already set, the record type and the
   match criteria are read from resources named after the file extension
   (optionally prefixed with "<group>."); the file is then opened and
   file_extract_record() is called repeatedly for as long as it returns
   non-zero, no sysno was supplied and more input remains.
   NOTE(review): the sprintf/strcpy/strcat calls below write into local
   buffers whose sizes are declared elsewhere in this function; no length
   checks are visible - confirm they cannot overflow. */
750 int fileExtract (ZebraHandle zh, SYSNO *sysno, const char *fname,
757 struct file_read_info *fi;
/* resource names are prefixed with the group name, if there is one */
759 if (!zh->m_group || !*zh->m_group)
762 sprintf (gprefix, "%s.", zh->m_group);
764 logf (LOG_DEBUG, "fileExtract %s", fname);
766 /* determine file extension */
/* scan backwards from the end of the name for the last '.' */
768 for (i = strlen(fname); --i >= 0; )
771 else if (fname[i] == '.')
773 strcpy (ext, fname+i+1);
776 /* determine file type - depending on extension */
777 if (!zh->m_record_type)
779 sprintf (ext_res, "%srecordType.%s", gprefix, ext);
780 zh->m_record_type = res_get (zh->res, ext_res);
/* still no record type: the file cannot be handled */
782 if (!zh->m_record_type)
784 if (zh->records_processed < zh->m_file_verbose_limit)
785 logf (LOG_LOG, "? %s", fname);
788 /* determine match criteria */
789 if (!zh->m_record_id)
791 sprintf (ext_res, "%srecordId.%s", gprefix, ext);
792 zh->m_record_id = res_get (zh->res, ext_res);
795 if (sysno && deleteFlag)
/* relative names are resolved against path_reg when it is set */
801 if (zh->path_reg && !yaz_is_abspath (fname))
803 strcpy (full_rep, zh->path_reg);
804 strcat (full_rep, "/");
805 strcat (full_rep, fname);
808 strcpy (full_rep, fname);
811 if ((fd = open (full_rep, O_BINARY|O_RDONLY)) == -1)
813 logf (LOG_WARN|LOG_ERRNO, "open %s", full_rep);
817 fi = file_read_start (fd);
/* one call per record in the file */
821 r = file_extract_record (zh, sysno, fname, deleteFlag, fi, 1);
822 } while (r && !sysno && fi->file_more);
830 If sysno is provided, then it's used to identify the record.
831 If not, and match_criteria is provided, then sysno is guessed
832 If not, and a record is provided, then sysno is got from there
/* Extract one record held in memory (buf, buf_size) and apply it to the
   register.  The record type is taken from recordType when given,
   otherwise from zh->m_record_type; the filter is run on the buffer to
   collect keys; the match string is match_criteria when given, otherwise
   it is built from zh->m_record_id; the record is then added, updated or
   deleted and its bookkeeping fields are refreshed before it is
   committed with rec_put(). */
835 int buffer_extract_record (ZebraHandle zh,
836 const char *buf, size_t buf_size,
839 const char *recordType,
841 const char *match_criteria,
846 RecordAttr *recordAttr;
847 struct recExtractCtrl extractCtrl;
849 const char *matchStr = 0;
850 RecType recType = NULL;
854 long recordOffset = 0;
855 struct zebra_fetch_control fc;
856 const char *pr_fname = fname; /* filename to print .. */
859 pr_fname = "<no file>"; /* make it printable if file is omitted */
/* the filter reads the record from the caller's buffer through fc */
862 fc.record_int_buf = buf;
863 fc.record_int_len = buf_size;
864 fc.record_int_pos = 0;
866 fc.record_offset = 0;
868 extractCtrl.offset = 0;
869 extractCtrl.readf = zebra_record_int_read;
870 extractCtrl.seekf = zebra_record_int_seek;
871 extractCtrl.tellf = zebra_record_int_tell;
872 extractCtrl.endf = zebra_record_int_end;
873 extractCtrl.fh = &fc;
/* start with empty key and sort-key buffers */
875 zh->reg->keys.buf_used = 0;
876 zh->reg->keys.prevAttrUse = -1;
877 zh->reg->keys.prevAttrSet = -1;
878 zh->reg->keys.prevSeqNo = 0;
879 zh->reg->sortKeys.buf_used = 0;
881 if (zebraExplain_curDatabase (zh->reg->zei, zh->basenames[0]))
883 if (zebraExplain_newDatabase (zh->reg->zei, zh->basenames[0],
884 zh->m_explain_database))
888 if (recordType && *recordType) {
889 logf (LOG_DEBUG, "Record type explicitly specified: %s", recordType);
890 recType = recType_byName (zh->reg->recTypes, recordType, subType,
893 if (!(zh->m_record_type)) {
894 logf (LOG_WARN, "No such record type defined");
897 logf (LOG_DEBUG, "Get record type from rgroup: %s",zh->m_record_type);
898 recType = recType_byName (zh->reg->recTypes, zh->m_record_type, subType,
900 recordType = zh->m_record_type;
/* NOTE(review): this message prints zh->m_record_type even when the
   type was given explicitly in recordType; it may then name the wrong
   type, or pass NULL to %s - confirm */
904 logf (LOG_WARN, "No such record type: %s", zh->m_record_type);
908 extractCtrl.subType = subType;
909 extractCtrl.init = extract_init;
910 extractCtrl.tokenAdd = extract_token_add;
911 extractCtrl.schemaAdd = extract_schema_add;
912 extractCtrl.dh = zh->reg->dh;
913 extractCtrl.handle = zh;
914 extractCtrl.zebra_maps = zh->reg->zebra_maps;
915 extractCtrl.flagShowRecords = 0;
/* sequence numbers start at 1 for positioned register types, 0 otherwise */
916 for (i = 0; i<256; i++)
918 if (zebra_maps_is_positioned(zh->reg->zebra_maps, i))
919 extractCtrl.seqno[i] = 1;
921 extractCtrl.seqno[i] = 0;
/* run the record-type filter */
924 r = (*recType->extract)(clientData, &extractCtrl);
926 if (r == RECCTRL_EXTRACT_EOF)
928 else if (r == RECCTRL_EXTRACT_ERROR_GENERIC)
930 /* error occurred during extraction ... */
931 yaz_log (LOG_WARN, "extract error: generic");
934 else if (r == RECCTRL_EXTRACT_ERROR_NO_SUCH_FILTER)
936 /* error occurred during extraction ... */
937 yaz_log (LOG_WARN, "extract error: no such filter");
940 if (zh->reg->keys.buf_used == 0)
942 /* the extraction process returned no information - the record
943 is probably empty - unless flagShowRecords is in use */
946 logf (LOG_WARN, "No keys generated for record");
947 logf (LOG_WARN, " The file is probably empty");
/* an explicit match string takes precedence over the configured record id */
955 if (match_criteria && *match_criteria) {
956 matchStr = match_criteria;
958 if (zh->m_record_id && *zh->m_record_id) {
959 matchStr = fileMatchStr (zh, &zh->reg->keys, pr_fname,
963 logf (LOG_WARN, "Bad match criteria (recordID)");
/* a hit in the match dictionary yields the sysno of the existing record */
969 rinfo = dict_lookup (zh->reg->matchDict, matchStr);
971 memcpy (sysno, rinfo+1, sizeof(*sysno));
980 logf (LOG_LOG, "delete %s %s %ld", recordType,
981 pr_fname, (long) recordOffset);
982 logf (LOG_WARN, "cannot delete record above (seems new)");
985 logf (LOG_LOG, "add %s %s %ld", recordType, pr_fname,
986 (long) recordOffset);
/* new record: create it, register its match string, flush its keys */
987 rec = rec_new (zh->reg->records);
991 recordAttr = rec_init_attr (zh->reg->zei, rec);
995 dict_insert (zh->reg->matchDict, matchStr,
996 sizeof(*sysno), sysno);
998 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
999 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
1001 zh->records_inserted++;
1005 /* record already exists */
1006 struct recKeys delkeys;
1007 struct sortKeys sortKeys;
1009 if (!allow_update) {
1010 logf (LOG_LOG, "skipped %s %s %ld",
1011 recordType, pr_fname, (long) recordOffset);
1016 rec = rec_get (zh->reg->records, *sysno);
1019 recordAttr = rec_init_attr (zh->reg->zei, rec);
/* unless forced, a record already stamped with the current run number
   is reported as skipped */
1021 if (!force_update) {
1022 if (recordAttr->runNumber ==
1023 zebraExplain_runNumberIncrement (zh->reg->zei, 0))
1025 logf (LOG_LOG, "skipped %s %s %ld", recordType,
1026 pr_fname, (long) recordOffset);
1027 extract_flushSortKeys (zh, *sysno, -1, &zh->reg->sortKeys);
/* the keys stored with the existing record are flushed with cmd 0 */
1034 delkeys.buf_used = rec->size[recInfo_delKeys];
1035 delkeys.buf = rec->info[recInfo_delKeys];
1037 sortKeys.buf_used = rec->size[recInfo_sortKeys];
1038 sortKeys.buf = rec->info[recInfo_sortKeys];
1040 extract_flushSortKeys (zh, *sysno, 0, &sortKeys);
1041 extract_flushRecordKeys (zh, *sysno, 0, &delkeys);
1044 /* record going to be deleted */
1045 if (!delkeys.buf_used)
1047 logf (LOG_LOG, "delete %s %s %ld", recordType,
1048 pr_fname, (long) recordOffset);
1049 logf (LOG_WARN, "cannot delete file above, storeKeys false");
1053 logf (LOG_LOG, "delete %s %s %ld", recordType,
1054 pr_fname, (long) recordOffset);
1055 zh->records_deleted++;
1057 dict_delete (zh->reg->matchDict, matchStr);
1058 rec_del (zh->reg->records, &rec);
1066 /* record going to be updated */
1067 if (!delkeys.buf_used)
1069 logf (LOG_LOG, "update %s %s %ld", recordType,
1070 pr_fname, (long) recordOffset);
1071 logf (LOG_WARN, "cannot update file above, storeKeys false");
1075 logf (LOG_LOG, "update %s %s %ld", recordType,
1076 pr_fname, (long) recordOffset);
1077 extract_flushSortKeys (zh, *sysno, 1, &zh->reg->sortKeys);
1078 extract_flushRecordKeys (zh, *sysno, 1, &zh->reg->keys);
1079 zh->records_updated++;
1083 /* update file type */
1084 xfree (rec->info[recInfo_fileType]);
1085 rec->info[recInfo_fileType] =
1086 rec_strdup (recordType, &rec->size[recInfo_fileType]);
1088 /* update filename */
/* NOTE(review): fname (not pr_fname) is stored; pr_fname exists because
   the file name may be omitted - confirm rec_strdup accepts that case */
1089 xfree (rec->info[recInfo_filename]);
1090 rec->info[recInfo_filename] =
1091 rec_strdup (fname, &rec->size[recInfo_filename]);
1093 /* update delete keys */
1094 xfree (rec->info[recInfo_delKeys]);
1095 if (zh->reg->keys.buf_used > 0 && zh->m_store_keys == 1)
/* the key buffer is handed over to the record; the register's pointer
   and capacity are reset */
1097 rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;
1098 rec->info[recInfo_delKeys] = zh->reg->keys.buf;
1099 zh->reg->keys.buf = NULL;
1100 zh->reg->keys.buf_max = 0;
1104 rec->info[recInfo_delKeys] = NULL;
1105 rec->size[recInfo_delKeys] = 0;
1108 /* update sort keys */
1109 xfree (rec->info[recInfo_sortKeys]);
/* same hand-over for the sort key buffer */
1111 rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;
1112 rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;
1113 zh->reg->sortKeys.buf = NULL;
1114 zh->reg->sortKeys.buf_max = 0;
1116 /* save file size of original record */
1117 zebraExplain_recordBytesIncrement (zh->reg->zei,
1118 - recordAttr->recordSize);
/* NOTE(review): the next three lines use fi, which this function does
   not visibly declare; presumably they sit in a disabled preprocessor
   branch and only the buf_size assignment is compiled - confirm */
1120 recordAttr->recordSize = fi->file_moffset - recordOffset;
1121 if (!recordAttr->recordSize)
1122 recordAttr->recordSize = fi->file_max - recordOffset;
1124 recordAttr->recordSize = buf_size;
1126 zebraExplain_recordBytesIncrement (zh->reg->zei,
1127 recordAttr->recordSize);
1129 /* set run-number for this record */
1130 recordAttr->runNumber =
1131 zebraExplain_runNumberIncrement (zh->reg->zei, 0);
1133 /* update store data */
1134 xfree (rec->info[recInfo_storeData]);
1135 if (zh->m_store_data)
/* keep a private copy of the caller's buffer with the record */
1137 rec->size[recInfo_storeData] = recordAttr->recordSize;
1138 rec->info[recInfo_storeData] = (char *)
1139 xmalloc (recordAttr->recordSize);
1140 memcpy (rec->info[recInfo_storeData], buf, recordAttr->recordSize);
1144 rec->info[recInfo_storeData] = NULL;
1145 rec->size[recInfo_storeData] = 0;
1147 /* update database name */
1148 xfree (rec->info[recInfo_databaseName]);
1149 rec->info[recInfo_databaseName] =
1150 rec_strdup (zh->basenames[0], &rec->size[recInfo_databaseName]);
1153 recordAttr->recordOffset = recordOffset;
1155 /* commit this record */
1156 rec_put (zh->reg->records, &rec);
/* Re-index the record rec from the data1 tree n.  The database named in
   the record is made current (created if needed), keys are generated
   from the tree with grs_extract_tree(), any keys previously stored with
   the record are flushed with cmd 0 and the new ones with cmd 1, and the
   new key and sort-key buffers are stored with the record.
   handle is the ZebraHandle, passed as void *. */
1161 int explain_extract (void *handle, Record rec, data1_node *n)
1163 ZebraHandle zh = (ZebraHandle) handle;
1164 struct recExtractCtrl extractCtrl;
1167 if (zebraExplain_curDatabase (zh->reg->zei,
1168 rec->info[recInfo_databaseName]))
1171 if (zebraExplain_newDatabase (zh->reg->zei,
1172 rec->info[recInfo_databaseName], 0))
/* start with empty key and sort-key buffers */
1176 zh->reg->keys.buf_used = 0;
1177 zh->reg->keys.prevAttrUse = -1;
1178 zh->reg->keys.prevAttrSet = -1;
1179 zh->reg->keys.prevSeqNo = 0;
1180 zh->reg->sortKeys.buf_used = 0;
1182 extractCtrl.init = extract_init;
1183 extractCtrl.tokenAdd = extract_token_add;
1184 extractCtrl.schemaAdd = extract_schema_add;
1185 extractCtrl.dh = zh->reg->dh;
/* all sequence numbers start at 0 here */
1186 for (i = 0; i<256; i++)
1187 extractCtrl.seqno[i] = 0;
1188 extractCtrl.zebra_maps = zh->reg->zebra_maps;
1189 extractCtrl.flagShowRecords = 0;
1190 extractCtrl.handle = handle;
/* generate keys from the tree */
1193 grs_extract_tree(&extractCtrl, n);
/* the record carries keys from an earlier run: flush them with cmd 0 */
1195 if (rec->size[recInfo_delKeys])
1197 struct recKeys delkeys;
1198 struct sortKeys sortkeys;
1200 delkeys.buf_used = rec->size[recInfo_delKeys];
1201 delkeys.buf = rec->info[recInfo_delKeys];
1203 sortkeys.buf_used = rec->size[recInfo_sortKeys];
1204 sortkeys.buf = rec->info[recInfo_sortKeys];
1206 extract_flushSortKeys (zh, rec->sysno, 0, &sortkeys);
1207 extract_flushRecordKeys (zh, rec->sysno, 0, &delkeys);
/* flush the freshly generated keys with cmd 1 */
1209 extract_flushRecordKeys (zh, rec->sysno, 1, &zh->reg->keys);
1210 extract_flushSortKeys (zh, rec->sysno, 1, &zh->reg->sortKeys);
/* hand the key buffer over to the record and reset the register's copy */
1212 xfree (rec->info[recInfo_delKeys]);
1213 rec->size[recInfo_delKeys] = zh->reg->keys.buf_used;
1214 rec->info[recInfo_delKeys] = zh->reg->keys.buf;
1215 zh->reg->keys.buf = NULL;
1216 zh->reg->keys.buf_max = 0;
/* same hand-over for the sort key buffer */
1218 xfree (rec->info[recInfo_sortKeys]);
1219 rec->size[recInfo_sortKeys] = zh->reg->sortKeys.buf_used;
1220 rec->info[recInfo_sortKeys] = zh->reg->sortKeys.buf;
1221 zh->reg->sortKeys.buf = NULL;
1222 zh->reg->sortKeys.buf_max = 0;
/* Decode the packed keys in reckeys and append them, tagged with cmd,
   to the register's in-memory key buffer for record sysno.  The buffer
   is allocated on first use, sized from the "memmax" resource (in MB,
   default "8").  Key bytes are written from the start of the buffer
   while a pointer to each key is stored counting down from the top;
   when fewer than 1024 bytes separate the two, the buffer is first
   written out with extract_flushWriteKeys(). */
1227 void extract_flushRecordKeys (ZebraHandle zh, SYSNO sysno,
1228 int cmd, struct recKeys *reckeys)
1232 unsigned char attrSet = (unsigned char) -1;
1233 unsigned short attrUse = (unsigned short) -1;
1238 ZebraExplainInfo zei = zh->reg->zei;
/* first use: allocate the key buffer */
1240 if (!zh->reg->key_buf)
/* NOTE(review): the product is computed in int, so large memmax values
   can overflow - confirm how the result is validated */
1242 int mem= 1024*1024* atoi( res_get_def( zh->res, "memmax", "8"));
1245 logf(LOG_WARN, "Invalid memory setting, using default 8 MB");
1248 /* FIXME: That "8" should be in a default settings include */
1249 /* not hard-coded here! -H */
1250 zh->reg->key_buf = (char**) xmalloc (mem);
/* ptr_top: number of char* slots that fit in the buffer */
1251 zh->reg->ptr_top = mem/sizeof(char*);
1253 zh->reg->key_buf_used = 0;
1254 zh->reg->key_file_no = 0;
/* record count goes up for a non-zero cmd, down for cmd 0 */
1256 zebraExplain_recordCountIncrement (zei, cmd ? 1 : -1);
/* walk the packed buffer one encoded key at a time */
1257 while (off < reckeys->buf_used)
1259 const char *src = reckeys->buf + off;
1268 memcpy (&ch, src, sizeof(ch));
1274 memcpy (&attrSet, src, sizeof(attrSet));
1275 src += sizeof(attrSet);
1279 memcpy (&attrUse, src, sizeof(attrUse));
1280 src += sizeof(attrUse);
/* less than 1024 bytes left between key data and the pointer area:
   write out what has been collected so far */
1283 if (zh->reg->key_buf_used + 1024 >
1284 (zh->reg->ptr_top -zh->reg->ptr_i)*sizeof(char*))
1285 extract_flushWriteKeys (zh,0);
/* store a pointer to where this key's bytes are about to be written */
1287 (zh->reg->key_buf)[zh->reg->ptr_top - zh->reg->ptr_i] =
1288 (char*)zh->reg->key_buf + zh->reg->key_buf_used;
1291 ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
1293 ch = zebraExplain_addSU (zei, attrSet, attrUse);
1296 zh->reg->key_buf_used +=
1297 key_SU_encode (ch,((char*)zh->reg->key_buf) +
1298 zh->reg->key_buf_used);
/* copy the key string byte by byte */
1301 ((char*)zh->reg->key_buf) [(zh->reg->key_buf_used)++] = *src++;
/* terminate the string and append the command byte */
1303 ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = '\0';
1304 ((char*)(zh->reg->key_buf))[(zh->reg->key_buf_used)++] = cmd;
/* compact form: seqno delta kept in bits 2-5 of lead, biased by one */
1307 seqno += ((lead>>2) & 15)-1;
/* full form: seqno stored verbatim in the buffer */
1310 memcpy (&seqno, src, sizeof(seqno));
1311 src += sizeof(seqno);
/* the key struct follows the string and command byte */
1315 memcpy ((char*)zh->reg->key_buf + zh->reg->key_buf_used, &key, sizeof(key));
1316 (zh->reg->key_buf_used) += sizeof(key);
1317 off = src - reckeys->buf;
/* decoding must end exactly at the end of the used part of the buffer */
1319 assert (off == reckeys->buf_used);
/* Sort the keys collected in the register's in-memory key buffer and
   either merge them directly from memory or write them to a numbered
   temporary section file, depending on temp_policy and final.  The key
   buffer is marked empty afterwards.  Nothing is done when there is no
   buffer or no keys have been collected. */
1322 void extract_flushWriteKeys (ZebraHandle zh, int final)
1323 /* optimizing: if final=1, and no files written yet */
1324 /* push the keys directly to merge, sidestepping the */
1325 /* temp file altogether. Speeds small updates */
1328 char out_fname[200];
1330 struct encode_info encode_info;
1331 int ptr_i = zh->reg->ptr_i;
1336 if (!zh->reg->key_buf || ptr_i <= 0)
1338 logf (LOG_DEBUG, " nothing to flush section=%d buf=%p i=%d",
1339 zh->reg->key_file_no, zh->reg->key_buf, ptr_i);
1340 logf (LOG_DEBUG, " buf=%p ",
1342 logf (LOG_DEBUG, " ptr=%d ",zh->reg->ptr_i);
1343 logf (LOG_DEBUG, " reg=%p ",zh->reg);
1348 (zh->reg->key_file_no)++;
1349 logf (LOG_LOG, "sorting section %d", (zh->reg->key_file_no));
1350 logf (LOG_DEBUG, " sort_buff at %p n=%d",
1351 zh->reg->key_buf + zh->reg->ptr_top - ptr_i,ptr_i);
/* sort the ptr_i key pointers stored at the top of the buffer */
1353 qsort (zh->reg->key_buf + zh->reg->ptr_top - ptr_i, ptr_i,
1354 sizeof(char*), key_qsort_compare);
1356 /* Case 1: always use temp files (old way) */
1357 /* Case 2: use temp files, if more than one (auto) */
1358 /* = if this is both the last and the first */
1359 /* Case 3: never bother with temp files (new) */
1361 /* FIXME - will come from config file into zh */
1363 if ( ( temp_policy ==3 ) || /* always from memory */
1364 ( ( temp_policy ==2 ) && /* automatic */
1365 (zh->reg->key_file_no == 1) && /* this is first time */
1366 (final) ) ) /* and last (=only) time */
1367 { /* go directly from memory */
1368 zh->reg->key_file_no =0; /* signal not to read files */
1369 zebra_index_merge(zh);
1371 zh->reg->key_buf_used = 0;
1375 /* Not doing directly from memory, write into a temp file */
1376 extract_get_fname_tmp (zh, out_fname, zh->reg->key_file_no);
1378 if (!(outf = fopen (out_fname, "wb")))
1380 logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);
1383 logf (LOG_LOG, "writing section %d", zh->reg->key_file_no);
1384 prevcp = cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i];
1386 encode_key_init (&encode_info);
1387 encode_key_write (cp, &encode_info, outf);
1391 cp = (zh->reg->key_buf)[zh->reg->ptr_top - ptr_i];
/* a different key string starts a new encoded run; otherwise only the
   part after the string is written */
1392 if (strcmp (cp, prevcp))
1394 encode_key_flush ( &encode_info, outf);
1395 encode_key_init (&encode_info);
1396 encode_key_write (cp, &encode_info, outf);
1400 encode_key_write (cp + strlen(cp), &encode_info, outf);
1402 encode_key_flush ( &encode_info, outf);
/* NOTE(review): the statements from here down to the final prevcp
   assignment use bare key_buf / ptr_top / key_file_no rather than the
   zh->reg members; presumably an alternative implementation kept in a
   disabled preprocessor branch - confirm */
1404 qsort (key_buf + ptr_top-ptr_i, ptr_i, sizeof(char*), key_x_compare);
1405 extract_get_fname_tmp (out_fname, key_file_no);
1407 if (!(outf = fopen (out_fname, "wb")))
1409 logf (LOG_FATAL|LOG_ERRNO, "fopen %s", out_fname);
1412 logf (LOG_LOG, "writing section %d", key_file_no);
1414 prevcp = key_buf[ptr_top-i];
1416 if (!--i || strcmp (prevcp, key_buf[ptr_top-i]))
1418 key_y_len = strlen(prevcp)+1;
1420 logf (LOG_LOG, "key_y_len: %2d %02x %02x %s",
1421 key_y_len, prevcp[0], prevcp[1], 2+prevcp);
1423 qsort (key_buf + ptr_top-ptr_i, ptr_i - i,
1424 sizeof(char*), key_y_compare);
1425 cp = key_buf[ptr_top-ptr_i];
1427 encode_key_init (&encode_info);
1428 encode_key_write (cp, &encode_info, outf);
1431 cp = key_buf[ptr_top-ptr_i];
1432 encode_key_write (cp+key_y_len, &encode_info, outf);
1434 encode_key_flush ( &encode_info, outf);
1437 prevcp = key_buf[ptr_top-ptr_i];
1442 logf (LOG_FATAL|LOG_ERRNO, "fclose %s", out_fname);
1445 logf (LOG_LOG, "finished section %d", zh->reg->key_file_no);
/* the in-memory buffer is empty again */
1447 zh->reg->key_buf_used = 0;
/* Append one index term (string) for the word p to the register's packed
   key buffer.  Attribute set, attribute use and sequence number are
   compared with those of the previous entry, and the sequence number
   difference is folded into the lead byte when it fits in 1..15.  The
   buffer is grown by 128000 bytes whenever fewer than 1024 bytes are
   free.
   NOTE(review): no check of the string length against that 1024-byte
   headroom is visible here - confirm the callers bound it. */
1450 void extract_add_index_string (RecWord *p, const char *string,
1454 unsigned char attrSet;
1455 unsigned short attrUse;
1458 int *pseqno = &p->seqno;
1459 ZebraHandle zh = p->extractCtrl->handle;
1460 ZebraExplainInfo zei = zh->reg->zei;
1461 struct recKeys *keys = &zh->reg->keys;
/* grow the buffer, carrying over what has been written so far */
1463 if (keys->buf_used+1024 > keys->buf_max)
1467 b = (char *) xmalloc (keys->buf_max += 128000);
1468 if (keys->buf_used > 0)
1469 memcpy (b, keys->buf, keys->buf_used);
1473 dst = keys->buf + keys->buf_used;
/* compare set and use with the previous entry, then remember them */
1475 attrSet = p->attrSet;
1476 if (keys->buf_used > 0 && keys->prevAttrSet == attrSet)
1479 keys->prevAttrSet = attrSet;
1480 attrUse = p->attrUse;
1481 if (keys->buf_used > 0 && keys->prevAttrUse == attrUse)
1484 keys->prevAttrUse = attrUse;
/* seqno delta, biased by one; values 1..15 go into bits 2-5 of lead */
1486 diff = 1 + *pseqno - keys->prevSeqNo;
1487 if (diff >= 1 && diff <= 15)
1488 lead |= (diff << 2);
1492 keys->prevSeqNo = *pseqno;
1499 int ch = zebraExplain_lookupSU (zei, attrSet, attrUse);
1502 ch = zebraExplain_addSU (zei, attrSet, attrUse);
1503 yaz_log (LOG_DEBUG, "addSU set=%d use=%d SU=%d",
1504 attrSet, attrUse, ch);
1507 memcpy (dst, &ch, sizeof(ch));
1513 memcpy (dst, &attrSet, sizeof(attrSet));
1514 dst += sizeof(attrSet);
1518 memcpy (dst, &attrUse, sizeof(attrUse));
1519 dst += sizeof(attrUse);
/* register type byte followed by the term itself */
1522 *dst++ = p->reg_type;
1523 memcpy (dst, string, length);
/* full sequence number stored verbatim */
1529 memcpy (dst, pseqno, sizeof(*pseqno));
1530 dst += sizeof(*pseqno);
1532 keys->buf_used = dst - keys->buf;
/* Append a sort key (string) for the word p to the register's sort-key
   buffer.  Each entry is encoded as attribute set, attribute use and
   length (all via key_SU_encode) followed by the string bytes.  The
   existing entries are scanned first for one with the same set and use.
   NOTE(review): presumably an existing entry for the same set/use makes
   the function return without adding anything - confirm. */
1535 static void extract_add_sort_string (RecWord *p, const char *string,
1538 ZebraHandle zh = p->extractCtrl->handle;
1539 struct sortKeys *sk = &zh->reg->sortKeys;
/* walk the entries already in the buffer */
1542 while (off < sk->buf_used)
1546 off += key_SU_decode(&set, sk->buf + off);
1547 off += key_SU_decode(&use, sk->buf + off);
1548 off += key_SU_decode(&slen, sk->buf + off);
1550 if (p->attrSet == set && p->attrUse == use)
/* the scan must end exactly at the end of the used part of the buffer */
1553 assert (off == sk->buf_used);
/* grow by 128000 bytes when fewer than IT_MAX_WORD bytes are free */
1555 if (sk->buf_used + IT_MAX_WORD > sk->buf_max)
1559 b = (char *) xmalloc (sk->buf_max += 128000);
1560 if (sk->buf_used > 0)
1561 memcpy (b, sk->buf, sk->buf_used);
/* append the new entry: set, use, length, then the string bytes */
1565 off += key_SU_encode(p->attrSet, sk->buf + off);
1566 off += key_SU_encode(p->attrUse, sk->buf + off);
1567 off += key_SU_encode(length, sk->buf + off);
1568 memcpy (sk->buf + off, string, length);
1569 sk->buf_used = off + length;
/*
 * extract_add_string: hand one extracted word to the sort-key or the
 * index-key collector, depending on whether p->reg_type is a sort
 * register according to zebra_maps_is_sort.
 *
 * length must be greater than zero (asserted).
 *
 * NOTE(review): the `else` that presumably separates the two calls is on
 * a line not present in this view.
 */
1572 void extract_add_string (RecWord *p, const char *string, int length)
1574 assert (length > 0);
1575 if (zebra_maps_is_sort (p->zebra_maps, p->reg_type))
1576 extract_add_sort_string (p, string, length);
1578 extract_add_index_string (p, string, length);
/*
 * extract_add_incomplete_field: split p->string into words using the
 * character map for p->reg_type and pass each word to
 * extract_add_string.
 *
 * NOTE(review): loop bodies, braces and short statements are partly
 * missing from this view; comments describe the visible statements only.
 */
1581 static void extract_add_incomplete_field (RecWord *p)
1583 const char *b = p->string;
1584 int remain = p->length;
1585 const char **map = 0;
/* zebra_maps_input is given &b, so it can move b forward; `remain` is
 * recomputed from b after each step below. */
1588 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
1592 char buf[IT_MAX_WORD+1];
/* Skip mapped output that starts with the CHR_SPACE character. */
1596 while (map && *map && **map == *CHR_SPACE)
1598 remain = p->length - (b - p->string);
1600 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
/* Collect mapped output up to the next space; the inner loop is bounded
 * by IT_MAX_WORD (presumably it copies *cp into buf[i] -- the copying
 * statement itself is not visible). */
1607 while (map && *map && **map != *CHR_SPACE)
1609 const char *cp = *map;
1611 while (i < IT_MAX_WORD && *cp)
1613 remain = p->length - (b - p->string);
1615 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
/* Emit the collected word of i bytes. */
1621 extract_add_string (p, buf, i);
/*
 * extract_add_complete_field: build a single key from the whole of
 * p->string -- mapped through the character map for p->reg_type, with a
 * CHR_SPACE byte inserted between consecutive words -- and pass it to
 * extract_add_string.  The key is limited to IT_MAX_WORD bytes.
 *
 * NOTE(review): braces and short statements are partly missing from this
 * view; comments describe the visible statements only.
 */
1626 static void extract_add_complete_field (RecWord *p)
1628 const char *b = p->string;
1629 char buf[IT_MAX_WORD+1];
1630 const char **map = 0;
1631 int i = 0, remain = p->length;
1634 map = zebra_maps_input (p->zebra_maps, p->reg_type, &b, remain);
1636 while (remain > 0 && i < IT_MAX_WORD)
/* Skip mapped output that starts with the CHR_SPACE character. */
1638 while (map && *map && **map == *CHR_SPACE)
1640 remain = p->length - (b - p->string);
1642 map = zebra_maps_input(p->zebra_maps, p->reg_type, &b, remain);
/* Not the first word and room left: add one separating space. */
1649 if (i && i < IT_MAX_WORD)
1650 buf[i++] = *CHR_SPACE;
/* Collect mapped output up to the next space, bounded by IT_MAX_WORD
 * (the statement executed when the limit is hit, and the copy into buf,
 * are not visible here). */
1651 while (map && *map && **map != *CHR_SPACE)
1653 const char *cp = *map;
1655 if (i >= IT_MAX_WORD)
1657 while (i < IT_MAX_WORD && *cp)
1659 remain = p->length - (b - p->string);
1661 map = zebra_maps_input (p->zebra_maps, p->reg_type, &b,
/* Emit the assembled key of i bytes. */
1669 extract_add_string (p, buf, i);
/*
 * extract_token_add: entry point for one extracted token.  The token
 * text is first passed through zebra_replace; if that yields a buffer,
 * p->string and p->length are redirected to it.  The token is then
 * handled as a complete field or split into words, depending on
 * zebra_maps_is_complete for p->reg_type.
 *
 * NOTE(review): some lines are missing from this view, including the
 * end of the yaz_log call and the `else` that presumably separates the
 * two field handlers; whether the log call is compiled in cannot be
 * told from here.
 */
1672 void extract_token_add (RecWord *p)
1676 yaz_log (LOG_LOG, "token_add "
1677 "reg_type=%c attrSet=%d attrUse=%d seqno=%d s=%.*s",
1678 p->reg_type, p->attrSet, p->attrUse, p->seqno, p->length,
1681 if ((wrbuf = zebra_replace(p->zebra_maps, p->reg_type, 0,
1682 p->string, p->length)))
1684 p->string = wrbuf_buf(wrbuf);
1685 p->length = wrbuf_len(wrbuf);
1687 if (zebra_maps_is_complete (p->zebra_maps, p->reg_type))
1688 extract_add_complete_field (p);
1690 extract_add_incomplete_field(p);
/*
 * extract_schema_add: pass the schema OID `oid` to
 * zebraExplain_addSchema for the explain info (zh->reg->zei) of the
 * Zebra handle stored in p->handle.
 */
1693 void extract_schema_add (struct recExtractCtrl *p, Odr_oid *oid)
1695 ZebraHandle zh = (ZebraHandle) (p->handle);
1696 zebraExplain_addSchema (zh->reg->zei, oid);
/*
 * extract_flushSortKeys: write the sort keys collected in `sk` for
 * record `sysno` to the sort index zh->reg->sortIdx.
 *
 * The buffer is walked entry by entry; each entry is three
 * key_SU_decode'd integers (set, use, length) followed by the key bytes.
 *
 * NOTE(review): some lines are missing from this view.  Two sortIdx_add
 * calls are visible -- one storing the key bytes, one storing a single
 * NUL byte -- and the condition choosing between them (presumably based
 * on `cmd`) is not visible; TODO confirm.
 */
1699 void extract_flushSortKeys (ZebraHandle zh, SYSNO sysno,
1700 int cmd, struct sortKeys *sk)
1702 SortIdx sortIdx = zh->reg->sortIdx;
/* Select the record the following sort entries belong to. */
1705 sortIdx_sysno (sortIdx, sysno);
1707 while (off < sk->buf_used)
1711 off += key_SU_decode(&set, sk->buf + off);
1712 off += key_SU_decode(&use, sk->buf + off);
1713 off += key_SU_decode(&slen, sk->buf + off);
/* The attribute use value selects the sort type for this entry. */
1715 sortIdx_type(sortIdx, use);
1717 sortIdx_add(sortIdx, sk->buf + off, slen);
1719 sortIdx_add(sortIdx, "", 1);
/*
 * encode_key_init: presumably resets the encoder state in *i before keys
 * are written.  NOTE(review): the body of this function is not present
 * in this view, so the fields it sets are not documented here.
 */
1724 void encode_key_init (struct encode_info *i)
/*
 * encode_key_int: store the integer d at bp in a variable-length form
 * and return the position just past the last byte written.
 *
 * The two most significant bits of the first byte give the length; the
 * remaining bits hold d, most significant byte first:
 *
 *   00xxxxxx                      d <= 63        1 byte
 *   01xxxxxx + 1 byte             d <= 16383     2 bytes
 *   10xxxxxx + 2 bytes            d <= 4194303   3 bytes
 *   11xxxxxx + 3 bytes            otherwise      4 bytes
 *
 * Only values in the range 0 .. 2^30-1 are representable; d is expected
 * to be non-negative.  The caller must provide room for up to 4 bytes
 * at bp.
 */
char *encode_key_int (int d, char *bp)
{
    if (d <= 63)
        *bp++ = d;
    else if (d <= 16383)
    {
        *bp++ = 64 + (d>>8);
        *bp++ = d & 255;
    }
    else if (d <= 4194303)
    {
        *bp++ = 128 + (d>>16);
        *bp++ = (d>>8) & 255;
        *bp++ = d & 255;
    }
    else
    {
        *bp++ = 192 + (d>>24);
        *bp++ = (d>>16) & 255;
        *bp++ = (d>>8) & 255;
        *bp++ = d & 255;
    }
    return bp;
}
1762 /* This is the old encode_key_write.
1763 * It may be deleted once we are confident that the new one works.
/*
 * encode_key_write (older variant): encode one key and write it to outf
 * immediately.
 *
 * k     NUL-terminated key string, followed by one command byte and then
 *       a struct it_key (this is how k is read below).
 * i     encoder state; i->buf receives the encoded bytes and i->sysno,
 *       i->seqno, i->cmd hold the values of the previously written key
 *       so that only differences are stored.
 * outf  output stream.
 *
 * NOTE(review): some lines (declarations, braces, short statements such
 * as the action of the `else if` branch and the handling after a failed
 * fwrite) are not present in this view.
 */
1766 void encode_key_write (char *k, struct encode_info *i, FILE *outf)
/* Copy the key string including its terminating NUL; afterwards k points
 * at the byte following the NUL, which is used as the command value. */
1771 while ((*bp++ = *k++))
1773 memcpy (&key, k+1, sizeof(struct it_key));
/* The sysno delta is doubled and the command byte is added to it. */
1774 bp = encode_key_int ( (key.sysno - i->sysno) * 2 + *k, bp);
1775 if (i->sysno != key.sysno)
1777 i->sysno = key.sysno;
1780 else if (!i->seqno && !key.seqno && i->cmd == *k)
/* The sequence number is stored as a delta against the previous one. */
1782 bp = encode_key_int (key.seqno - i->seqno, bp);
1783 i->seqno = key.seqno;
/* Write the whole encoded entry as one item; a short write is logged as
 * fatal. */
1785 if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
1787 logf (LOG_FATAL|LOG_ERRNO, "fwrite");
/*
 * encode_key_flush (older variant): marked by its own comment as a dummy
 * routine.  NOTE(review): the lines after the opening brace are not
 * present in this view; presumably the body is empty.
 */
1792 void encode_key_flush (struct encode_info *i, FILE *outf)
1793 { /* dummy routine */
1798 /* New encode_key_write.
1799 * The idea is to buffer one more key and compare it with the next one.
1800 * If we are going to delete and insert the same key,
1801 * we may as well not bother. This should make a difference in
1802 * updates with small modifications (appending to an mbox).
/*
 * encode_key_write (buffering variant): the incoming key is compared
 * with the one remembered in i->prevsys / i->prevseq (and i->prevcmd);
 * the remembered key is only encoded and written to outf when the
 * incoming key differs from it.
 *
 * k     key string (may be empty), followed after its NUL by one byte
 *       and then a struct it_key, as read below.
 * i     encoder state: buf/keylen hold the key string, prevsys/prevseq/
 *       prevcmd the buffered entry, sysno/seqno/cmd the last entry
 *       actually written.
 * outf  output stream.
 *
 * NOTE(review): some lines (declarations, braces, short statements, part
 * of the `else if` condition, the handling after a failed fwrite) are
 * not present in this view; comments describe visible statements only.
 */
1804 void encode_key_write (char *k, struct encode_info *i, FILE *outf)
1809 if (*k) /* first time for new key */
/* Copy the key string including its NUL and remember its length
 * (without the NUL); the encoded entry must still fit in the buffer. */
1812 while ((*bp++ = *k++))
1814 i->keylen= bp - i->buf -1;
1815 assert(i->keylen+1+sizeof(struct it_key) < ENCODE_BUFLEN);
/* Position bp after the stored key string -- presumably the branch taken
 * when k is empty, i.e. the key string is unchanged; TODO confirm. */
1819 bp=i->buf + i->keylen;
1824 memcpy (&key, k+1, sizeof(struct it_key));
1825 if (0==i->prevsys) /* no previous filter, fill up */
1827 i->prevsys=key.sysno;
1828 i->prevseq=key.seqno;
1831 else if ( (i->prevsys==key.sysno) &&
1832 (i->prevseq==key.seqno) &&
1834 { /* same numbers, diff cmd, they cancel out */
1838 { /* different stuff, write previous, move buf */
/* Encode the buffered entry relative to the last written one: doubled
 * sysno delta plus command, then the seqno delta. */
1839 bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp);
1840 if (i->sysno != i->prevsys)
1842 i->sysno = i->prevsys;
1845 else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd)
1847 return; /* ??? Filters some sort of duplicates away */
1848 /* ??? Can this ever happen -H 15oct02 */
1850 bp = encode_key_int (i->prevseq - i->seqno, bp);
1851 i->seqno = i->prevseq;
1852 i->cmd = i->prevcmd;
/* Write the whole encoded entry as one item; a short write is logged as
 * fatal. */
1853 if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
1855 logf (LOG_FATAL|LOG_ERRNO, "fwrite");
1858 i->keylen=0; /* ok, it's written, forget it */
/* The incoming key becomes the new buffered entry. */
1859 i->prevsys=key.sysno;
1860 i->prevseq=key.seqno;
1865 void encode_key_flush (struct encode_info *i, FILE *outf)
1866 { /* flush the last key from i */
1867 char *bp =i->buf + i->keylen;
1870 return; /* nothing to flush */
1873 bp = encode_key_int ( (i->prevsys - i->sysno) * 2 + i->prevcmd, bp);
1874 if (i->sysno != i->prevsys)
1876 i->sysno = i->prevsys;
1879 else if (!i->seqno && !i->prevseq && i->cmd == i->prevcmd)
1881 return; /* ??? Filters some sort of duplicates away */
1882 /* ??? Can this ever happen -H 15oct02 */
1884 bp = encode_key_int (i->prevseq - i->seqno, bp);
1885 i->seqno = i->prevseq;
1886 i->cmd = i->prevcmd;
1887 if (fwrite (i->buf, bp - i->buf, 1, outf) != 1)
1889 logf (LOG_FATAL|LOG_ERRNO, "fwrite");
1892 i->keylen=0; /* ok, it's written, forget it */
1893 i->prevsys=0; /* forget the values too */