From: Adam Dickmeiss Date: Wed, 5 Jun 2013 13:45:25 +0000 (+0200) Subject: Use YAZ' new icu_iter_get_org_info for snippets X-Git-Tag: v1.6.32~7 X-Git-Url: http://jsfdemo.indexdata.com/cgi-bin?a=commitdiff_plain;h=b22c4ae2c14eb5d2b991e6233cdea2a9fd7dcf6c;p=pazpar2-moved-to-github.git Use YAZ' new icu_iter_get_org_info for snippets --- diff --git a/src/charsets.c b/src/charsets.c index f931445..9688628 100644 --- a/src/charsets.c +++ b/src/charsets.c @@ -53,6 +53,8 @@ struct pp2_charset_s { const char *(*token_next_handler)(pp2_charset_token_t prt); const char *(*get_sort_handler)(pp2_charset_token_t prt); const char *(*get_display_handler)(pp2_charset_token_t prt); + void (*get_org_handler)(pp2_charset_token_t ptr, + size_t *start, size_t *len); #if YAZ_HAVE_ICU struct icu_chain * icu_chn; UErrorCode icu_sts; @@ -63,11 +65,15 @@ static const char *pp2_charset_token_null(pp2_charset_token_t prt); static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt); static const char *pp2_get_sort_ascii(pp2_charset_token_t prt); static const char *pp2_get_display_ascii(pp2_charset_token_t prt); +static void pp2_get_org_ascii(pp2_charset_token_t prt, + size_t *start, size_t *len); #if YAZ_HAVE_ICU static const char *pp2_charset_token_icu(pp2_charset_token_t prt); static const char *pp2_get_sort_icu(pp2_charset_token_t prt); static const char *pp2_get_display_icu(pp2_charset_token_t prt); +static void pp2_get_org_icu(pp2_charset_token_t prt, + size_t *start, size_t *len); #endif /* tokenzier handle */ @@ -80,6 +86,9 @@ struct pp2_charset_token_s { #if YAZ_HAVE_ICU yaz_icu_iter_t iter; #endif + const char *cp0; + size_t start; + size_t len; }; struct pp2_charset_fact_s { @@ -226,6 +235,7 @@ pp2_charset_t pp2_charset_create(void) pct->token_next_handler = pp2_charset_token_null; pct->get_sort_handler = pp2_get_sort_ascii; pct->get_display_handler = pp2_get_display_ascii; + pct->get_org_handler = pp2_get_org_ascii; #if YAZ_HAVE_ICU pct->icu_chn = 0; #endif // YAZ_HAVE_ICU 
@@ -250,6 +260,7 @@ pp2_charset_t pp2_charset_create_icu(struct icu_chain *icu_chn) pct->token_next_handler = pp2_charset_token_icu; pct->get_sort_handler = pp2_get_sort_icu; pct->get_display_handler = pp2_get_display_icu; + pct->get_org_handler = pp2_get_org_icu; } return pct; } @@ -290,6 +301,8 @@ pp2_charset_token_t pp2_charset_tokenize(pp2_charset_t pct) if (pct->icu_chn) prt->iter = icu_iter_create(pct->icu_chn); #endif + prt->start = 0; + prt->len = 0; return prt; } @@ -313,6 +326,7 @@ void pp2_charset_token_first(pp2_charset_token_t prt, wrbuf_rewind(prt->norm_str); wrbuf_rewind(prt->sort_str); + prt->cp0 = buf; prt->cp = buf; prt->last_cp = 0; @@ -354,6 +368,12 @@ const char *pp2_get_display(pp2_charset_token_t prt) return prt->pct->get_display_handler(prt); } +void pp2_get_org(pp2_charset_token_t prt, size_t *start, size_t *len) +{ + prt->pct->get_org_handler(prt, start, len); +} + + #define raw_char(c) (((c) >= 'a' && (c) <= 'z') ? (c) : -1) /* original tokenizer with our tokenize interface, but we add +1 to ensure no '\0' are in our string (except for EOF) @@ -363,6 +383,7 @@ static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt) const char *cp = prt->cp; int c; + prt->start = cp - prt->cp0; /* skip white space */ while (*cp && (c = raw_char(tolower(*(const unsigned char *)cp))) < 0) cp++; @@ -381,6 +402,7 @@ static const char *pp2_charset_token_a_to_z(pp2_charset_token_t prt) wrbuf_putc(prt->norm_str, c); cp++; } + prt->len = (cp - prt->cp0) - prt->start; prt->cp = cp; return wrbuf_cstr(prt->norm_str); } @@ -412,6 +434,13 @@ static const char *pp2_get_display_ascii(pp2_charset_token_t prt) } } +static void pp2_get_org_ascii(pp2_charset_token_t prt, + size_t *start, size_t *len) +{ + *start = prt->start; + *len = prt->len; +} + static const char *pp2_charset_token_null(pp2_charset_token_t prt) { const char *cp = prt->cp; @@ -420,6 +449,7 @@ static const char *pp2_charset_token_null(pp2_charset_token_t prt) while (*cp) cp++; prt->cp = cp; + 
prt->len = cp - prt->cp0; return prt->last_cp; } @@ -443,6 +473,11 @@ static const char *pp2_get_display_icu(pp2_charset_token_t prt) return icu_iter_get_display(prt->iter); } +static void pp2_get_org_icu(pp2_charset_token_t prt, size_t *start, size_t *len) +{ + icu_iter_get_org_info(prt->iter, start, len); +} + #endif // YAZ_HAVE_ICU diff --git a/src/charsets.h b/src/charsets.h index b203ce4..3b8325b 100644 --- a/src/charsets.h +++ b/src/charsets.h @@ -45,6 +45,7 @@ void pp2_charset_token_destroy(pp2_charset_token_t prt); const char *pp2_charset_token_next(pp2_charset_token_t prt); const char *pp2_get_sort(pp2_charset_token_t prt); const char *pp2_get_display(pp2_charset_token_t prt); +void pp2_get_org(pp2_charset_token_t prt, size_t *start, size_t *len); #endif diff --git a/src/http_command.c b/src/http_command.c index 56e1f63..30de3d8 100644 --- a/src/http_command.c +++ b/src/http_command.c @@ -872,7 +872,8 @@ static void cmd_bytarget(struct http_channel *c) } static void write_metadata(WRBUF w, struct conf_service *service, - struct record_metadata **ml, int full, int indent) + struct record_metadata **ml, unsigned flags, + int indent) { int imeta; @@ -880,7 +881,7 @@ static void write_metadata(WRBUF w, struct conf_service *service, { struct conf_metadata *cmd = &service->metadata[imeta]; struct record_metadata *md; - if (!cmd->brief && !full) + if (!cmd->brief && !(flags & 1)) continue; for (md = ml[imeta]; md; md = md->next) { @@ -900,7 +901,10 @@ static void write_metadata(WRBUF w, struct conf_service *service, switch (cmd->type) { case Metadata_type_generic: - wrbuf_xmlputs(w, md->data.text.disp); + if (md->data.text.snippet && (flags & 2)) + wrbuf_puts(w, md->data.text.snippet); + else + wrbuf_xmlputs(w, md->data.text.disp); break; case Metadata_type_year: wrbuf_printf(w, "%d", md->data.number.min); @@ -917,7 +921,8 @@ static void write_metadata(WRBUF w, struct conf_service *service, } static void write_subrecord(struct record *r, WRBUF w, - struct 
conf_service *service, int show_details) + struct conf_service *service, unsigned flags, + int indent) { const char *name = session_setting_oneval( client_get_database(r->client), PZ_NAME); @@ -934,7 +939,7 @@ static void write_subrecord(struct record *r, WRBUF w, wrbuf_printf(w, "%u", r->checksum); wrbuf_puts(w, "\">\n"); - write_metadata(w, service, r->metadata, show_details, 2); + write_metadata(w, service, r->metadata, flags, indent); wrbuf_puts(w, " \n"); } @@ -997,6 +1002,8 @@ static void show_record(struct http_channel *c, struct http_session *s) const char *offsetstr = http_argbyname(rq, "offset"); const char *binarystr = http_argbyname(rq, "binary"); const char *checksumstr = http_argbyname(rq, "checksum"); + const char *snippets = http_argbyname(rq, "snippets"); + unsigned flags = (snippets && *snippets == '1') ? 3 : 1; if (!s) return; @@ -1090,9 +1097,9 @@ static void show_record(struct http_channel *c, struct http_session *s) } wrbuf_printf(c->wrbuf, " %d\n", session_active_clients(s->psession)); - write_metadata(c->wrbuf, service, rec->metadata, 1, 1); + write_metadata(c->wrbuf, service, rec->metadata, flags, 1); for (r = rec->records; r; r = r->next) - write_subrecord(r, c->wrbuf, service, 2); + write_subrecord(r, c->wrbuf, service, flags, 2); response_close(c, "record"); } show_single_stop(s->psession, rec); @@ -1133,6 +1140,8 @@ static void show_records(struct http_channel *c, struct http_session *s, const char *num = http_argbyname(rq, "num"); const char *sort = http_argbyname(rq, "sort"); int version = get_version(rq); + const char *snippets = http_argbyname(rq, "snippets"); + unsigned flags = (snippets && *snippets == '1') ? 
2 : 0; int startn = 0; int numn = 20; @@ -1187,9 +1196,9 @@ static void show_records(struct http_channel *c, struct http_session *s, struct conf_service *service = s->psession->service; wrbuf_puts(c->wrbuf, "\n"); - write_metadata(c->wrbuf, service, rec->metadata, 0, 1); + write_metadata(c->wrbuf, service, rec->metadata, flags, 1); for (ccount = 0, p = rl[i]->records; p; p = p->next, ccount++) - write_subrecord(p, c->wrbuf, service, 0); // subrecs w/o details + write_subrecord(p, c->wrbuf, service, flags, 2); wrbuf_printf(c->wrbuf, " %d\n", ccount); if (strstr(sort, "relevance")) { diff --git a/src/record.h b/src/record.h index 99a9e63..f2761f5 100644 --- a/src/record.h +++ b/src/record.h @@ -28,6 +28,7 @@ union data_types { struct { const char *disp; const char *sort; + const char *snippet; } text; struct { int min; diff --git a/src/relevance.c b/src/relevance.c index 0551980..4d5b6e4 100644 --- a/src/relevance.c +++ b/src/relevance.c @@ -83,6 +83,47 @@ static struct word_entry *word_entry_match(struct relevance *r, return 0; } +int relevance_snippet(struct relevance *r, + const char *words, const char *name, + WRBUF w_snippet) +{ + int no = 0; + const char *norm_str; +#if 1 + yaz_log(YLOG_LOG, "relevance_snippet for field=%s content=%s", + name, words); +#endif + pp2_charset_token_first(r->prt, words, 0); + + while ((norm_str = pp2_charset_token_next(r->prt))) + { + size_t org_start, org_len; + struct word_entry *entries = r->entries; + int highlight = 0; + int i = 0; /* initialised so the i++ in the loop below does not read an indeterminate value */ + + pp2_get_org(r->prt, &org_start, &org_len); + for (; entries; entries = entries->next, i++) + { + yaz_log(YLOG_LOG, "Compare: %s %s", norm_str, entries->norm_str); + if (*norm_str && !strcmp(norm_str, entries->norm_str)) + highlight = 1; + } + if (highlight) + wrbuf_puts(w_snippet, ""); + + wrbuf_xmlputs_n(w_snippet, words + org_start, org_len); + if (highlight) + wrbuf_puts(w_snippet, ""); + no += highlight; + } + if (no) + { + yaz_log(YLOG_LOG, "SNIPPET match: %s", wrbuf_cstr(w_snippet)); + } + 
return no; +} + void relevance_countwords(struct relevance *r, struct record_cluster *cluster, const char *words, const char *rank, const char *name) diff --git a/src/relevance.h b/src/relevance.h index 5a095eb..76bbc22 100644 --- a/src/relevance.h +++ b/src/relevance.h @@ -38,6 +38,10 @@ void relevance_newrec(struct relevance *r, struct record_cluster *cluster); void relevance_countwords(struct relevance *r, struct record_cluster *cluster, const char *words, const char *multiplier, const char *name); +int relevance_snippet(struct relevance *r, + const char *words, const char *name, + WRBUF w_snippet); + void relevance_donerecord(struct relevance *r, struct record_cluster *cluster); void relevance_prepare_read(struct relevance *rel, struct reclist *rec); diff --git a/src/session.c b/src/session.c index 880fc0c..e3e0a0d 100644 --- a/src/session.c +++ b/src/session.c @@ -1467,6 +1467,7 @@ static struct record_metadata *record_metadata_init( rec_md->data.text.disp = p; rec_md->data.text.sort = 0; + rec_md->data.text.snippet = 0; } else if (type == Metadata_type_year || type == Metadata_type_date) { @@ -1956,6 +1957,18 @@ static int ingest_to_cluster(struct client *cl, "for element '%s'", value, type); continue; } + + if (ser_md->type == Metadata_type_generic) + { + WRBUF w = wrbuf_alloc(); + if (relevance_snippet(se->relevance, + (char*) value, ser_md->name, w)) + rec_md->data.text.snippet = nmem_strdup(se->nmem, + wrbuf_cstr(w)); + wrbuf_destroy(w); + } + + wheretoput = &record->metadata[md_field_id]; while (*wheretoput) wheretoput = &(*wheretoput)->next; @@ -2183,7 +2196,6 @@ static int ingest_to_cluster(struct client *cl, relevance_countwords(se->relevance, cluster, (char *) value, rank, ser_md->name); } - // construct facets ... unless the client already has reported them if (ser_md->termlist && !client_has_facet(cl, (char *) type)) {