From bb692fa6c0b70501de135231ef51d92c95df1075 Mon Sep 17 00:00:00 2001 From: Adam Dickmeiss Date: Fri, 29 Nov 2013 14:21:53 +0100 Subject: [PATCH] Work on cluster merging; part of PAZ-901 --- src/reclists.c | 38 +++++++++++++++++++++++++++++++------- src/reclists.h | 1 + src/relevance.c | 31 ++++++++++++++----------------- src/session.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- src/session.h | 1 + 5 files changed, 85 insertions(+), 39 deletions(-) diff --git a/src/reclists.c b/src/reclists.c index 73a6a64..9d25970 100644 --- a/src/reclists.c +++ b/src/reclists.c @@ -358,8 +358,28 @@ int reclist_get_num_records(struct reclist *l) return 0; } +static void merge_cluster(struct reclist *l, + struct relevance *r, + struct record_cluster *dst, + struct record_cluster **src) +{ +#if 0 + dst->metadata = (*src)->metadata; + dst->sortkeys = (*src)->sortkeys; + int relevance_score; + int *term_frequency_vec; + float *term_frequency_vecf; + // Set-specific ID for this record + char *recid; + WRBUF relevance_explain1; + WRBUF relevance_explain2; + struct record *records; +#endif +} + // Insert a record. Return record cluster (newly formed or pre-existing) struct record_cluster *reclist_insert(struct reclist *l, + struct relevance *r, struct conf_service *service, struct record *record, struct record_metadata_attr *merge_keys, @@ -393,8 +413,7 @@ struct record_cluster *reclist_insert(struct reclist *l, { struct record **re; - cluster = (*p)->record; - for (re = &cluster->records; *re; re = &(*re)->next) + for (re = &(*p)->record->records; *re; re = &(*re)->next) { if ((*re)->client == record->client && record_compare(record, *re, service)) @@ -403,14 +422,19 @@ struct record_cluster *reclist_insert(struct reclist *l, return 0; } } - *re = record; - record->next = 0; - goto out; + + if (!cluster) + { + cluster = (*p)->record; + *re = record; + record->next = 0; + } + else + merge_cluster(l, r, cluster, &(*p)->record); } } } } -out: if (!cluster) { struct reclist_bucket *new = @@ -427,7 +451,6 @@ out: append_merge_keys(&cluster->merge_keys, merge_keys, l->nmem); cluster->relevance_score = 0; - cluster->term_frequency_vec = 0; cluster->recid = cluster->merge_keys->value; (*total)++; cluster->metadata = @@ -440,6 +463,7 @@ out: memset(cluster->sortkeys, 0, sizeof(union data_types*) * service->num_sortkeys); + relevance_newrec(r, cluster); cluster->relevance_explain1 = wrbuf_alloc(); cluster->relevance_explain2 = wrbuf_alloc(); /* attach to hash list */ diff --git a/src/reclists.h b/src/reclists.h index 769b0c2..ea4d263 100644 --- a/src/reclists.h +++ b/src/reclists.h @@ -39,6 +39,7 @@ struct reclist *reclist_create(NMEM); void reclist_destroy(struct reclist *l); void reclist_limit(struct reclist *l, struct session *session, int lazy); struct record_cluster *reclist_insert(struct reclist *tl, + struct relevance *r, struct conf_service *service, struct record *record, struct record_metadata_attr *merge_keys, diff --git a/src/relevance.c b/src/relevance.c index 08527ae..63558fb 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -320,24 +320,21 @@ void relevance_destroy(struct relevance **rp) void relevance_newrec(struct relevance *r, struct record_cluster *rec) { - if (!rec->term_frequency_vec) - { - int i; - - // term frequency [1,..] . [0] is total length of all fields - rec->term_frequency_vec = - nmem_malloc(r->nmem, - r->vec_len * sizeof(*rec->term_frequency_vec)); - for (i = 0; i < r->vec_len; i++) - rec->term_frequency_vec[i] = 0; + int i; - // term frequency divided by length of field [1,...] - rec->term_frequency_vecf = - nmem_malloc(r->nmem, - r->vec_len * sizeof(*rec->term_frequency_vecf)); - for (i = 0; i < r->vec_len; i++) - rec->term_frequency_vecf[i] = 0.0; - } + // term frequency [1,..] . [0] is total length of all fields + rec->term_frequency_vec = + nmem_malloc(r->nmem, + r->vec_len * sizeof(*rec->term_frequency_vec)); + for (i = 0; i < r->vec_len; i++) + rec->term_frequency_vec[i] = 0; + + // term frequency divided by length of field [1,...] + rec->term_frequency_vecf = + nmem_malloc(r->nmem, + r->vec_len * sizeof(*rec->term_frequency_vecf)); + for (i = 0; i < r->vec_len; i++) + rec->term_frequency_vecf[i] = 0.0; } void relevance_donerecord(struct relevance *r, struct record_cluster *cluster) diff --git a/src/session.c b/src/session.c index 9e52d59..2039c58 100644 --- a/src/session.c +++ b/src/session.c @@ -1728,24 +1728,50 @@ int ingest_record(struct client *cl, const char *rec, if (!strcmp((const char *) root->name, "cluster")) { + int no_merge_keys = 0; + int no_merge_dups = 0; xmlNode *sroot; + struct record_metadata_attr *mk = 0; + for (sroot = root->children; sroot; sroot = sroot->next) - if (sroot->type == XML_ELEMENT_NODE) + if (sroot->type == XML_ELEMENT_NODE && + !strcmp((const char *) sroot->name, "record")) { + struct record_metadata_attr **mkp; const char *mergekey_norm = get_mergekey(xdoc, sroot, cl, record_no, service, nmem, - se->mergekey); - - struct record_metadata_attr *mk = (struct record_metadata_attr*) - nmem_malloc(nmem, sizeof(*mk)); - mk->name = 0; - mk->value = nmem_strdup(nmem, mergekey_norm); - mk->next = 0; - + se->mergekey); + if (!mergekey_norm) + { + r = -1; + break; + } + for (mkp = &mk; *mkp; mkp = &(*mkp)->next) + if (!strcmp((*mkp)->value, mergekey_norm)) + break; + if (!*mkp) + { + *mkp = (struct record_metadata_attr*) + nmem_malloc(nmem, sizeof(**mkp)); + (*mkp)->name = 0; + (*mkp)->value = nmem_strdup(nmem, mergekey_norm); + (*mkp)->next = 0; + no_merge_keys++; + } + else + no_merge_dups++; + } + if (no_merge_keys > 1 || no_merge_dups > 0) + { + yaz_log(YLOG_LOG, "Got %d mergekeys, %d dups for position %d", + no_merge_keys, no_merge_dups, record_no); + } + for (sroot = root->children; !r && sroot; sroot = sroot->next) + if (sroot->type == XML_ELEMENT_NODE && + !strcmp((const char *) sroot->name, "record")) + { r = ingest_sub_record(cl, xdoc, sroot, record_no, nmem, sdb, mk); - if (r) - break; } } else if (!strcmp((const char *) root->name, "record")) @@ -2038,7 +2064,7 @@ static int ingest_to_cluster(struct client *cl, xmlFree(value); return -2; } - cluster = reclist_insert(se->reclist, service, record, + cluster = reclist_insert(se->reclist, se->relevance, service, record, merge_keys, &se->total_merged); if (!cluster) return 0; // complete match with existing record @@ -2061,9 +2087,6 @@ static int ingest_to_cluster(struct client *cl, session_log(se, YLOG_LOG, "Cluster id %s from %s (#%d)", cluster->recid, sdb->database->id, record_no); - - relevance_newrec(se->relevance, cluster); - // original metadata, to check if first existence of a field metadata0 = xmalloc(sizeof(*metadata0) * service->num_metadata); memcpy(metadata0, cluster->metadata, diff --git a/src/session.h b/src/session.h index 85f1b8f..d58019f 100644 --- a/src/session.h +++ b/src/session.h @@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA #include #include "facet_limit.h" +#include "relevance.h" #include "reclists.h" struct record; -- 1.7.10.4