From 4b100e1fae63305142ff41752417b6be73e38210 Mon Sep 17 00:00:00 2001 From: Dave West Date: Mon, 16 Jun 2008 17:25:53 +0000 Subject: [PATCH] Speed up for the indexer. This will NOT cause a re-build of the index but it does significantly improve performance when indexing a new message or if the index needs to be re-built. Basically the list of noise words is processed much faster. --- citadel/modules/fulltext/ft_wordbreaker.c | 68 +++++++++++++++++++++-- citadel/modules/fulltext/ft_wordbreaker.h | 11 ++++ citadel/modules/fulltext/serv_fulltext.c | 4 +- 3 files changed, 76 insertions(+), 7 deletions(-) diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c index aaa92a208..664a7de8d 100644 --- a/citadel/modules/fulltext/ft_wordbreaker.c +++ b/citadel/modules/fulltext/ft_wordbreaker.c @@ -50,7 +50,10 @@ * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID * must also be changed, so that the index is rebuilt. */ -static char *noise_words[] = { + +noise_word *noise_words[26]; + +static char *noise_words_init[] = { "about", "after", "also", @@ -114,6 +117,50 @@ static char *noise_words[] = { "your" }; + +void initialize_noise_words(void) +{ + int i; + int len; + int ch; + noise_word *next; + + memset (noise_words, 0, sizeof(noise_words)); + + for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i) + { + ch = noise_words_init[i][0] - 'a'; + len = strlen(noise_words_init[i]); + + next = malloc(sizeof(noise_word)); + next->len = len; + next->word = strdup(noise_words_init[i]); + next->next = noise_words[ch]; + noise_words[ch] = next; + } +} + + +void noise_word_cleanup(void) +{ + int i; + noise_word *cur, *next; + + CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n"); + + for (i = 0 ; i < 26 ; i++) + { + cur = noise_words[i]; + while (cur) + { + next = cur->next; + free(cur->word); + free(cur); + cur = next; + } + } +} + /* * Compare function */ @@ -143,7 +190,9 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) { char word[256]; int i; int word_crc; - + noise_word *noise; + + if (text == NULL) { /* no NULL text please */ *num_tokens = 0; *tokens = NULL; @@ -191,11 +240,18 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) { word[i] = tolower(word[i]); } /* disqualify noise words */ - for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) { - if (!strcmp(word, noise_words[i])) { - word_len = 0; - break; + noise = noise_words[(int) (word[0]-'a')]; + while (noise) + { + if (noise->len == word_len) + { + if (!strcmp(word, noise->word)) + { + word_len = 0; + break; + } } + noise = noise->next; } if (word_len == 0) continue; diff --git a/citadel/modules/fulltext/ft_wordbreaker.h b/citadel/modules/fulltext/ft_wordbreaker.h index 16f714560..b643af578 100644 --- a/citadel/modules/fulltext/ft_wordbreaker.h +++ b/citadel/modules/fulltext/ft_wordbreaker.h @@ -19,3 +19,14 @@ void wordbreaker(char *text, int *num_tokens, int **tokens); +void initialize_noise_words(void); +void noise_word_cleanup(void); + + +typedef struct noise_word noise_word; + +struct noise_word { + unsigned int len; + char *word; + noise_word *next; +}; diff --git a/citadel/modules/fulltext/serv_fulltext.c b/citadel/modules/fulltext/serv_fulltext.c index 280c6ca1f..d6eac6cb7 100644 --- a/citadel/modules/fulltext/serv_fulltext.c +++ b/citadel/modules/fulltext/serv_fulltext.c @@ -248,7 +248,7 @@ void do_fulltext_indexing(void) { * Check to see whether the fulltext index is up to date; if there * are no messages to index, don't waste any more time trying. */ - if (CitControl.MMfulltext >= CitControl.MMhighest) { + if ((CitControl.MMfulltext >= CitControl.MMhighest) && (CitControl.fulltext_wordbreaker == FT_WORDBREAKER_ID)) { return; /* nothing to do! */ } @@ -496,9 +496,11 @@ CTDL_MODULE_INIT(fulltext) if (!threading) { initialize_ft_cache(); + initialize_noise_words(); CtdlRegisterProtoHook(cmd_srch, "SRCH", "Full text search"); CtdlRegisterDeleteHook(ft_delete_remove); CtdlRegisterSearchFuncHook(ft_search, "fulltext"); + CtdlRegisterCleanupHook(noise_word_cleanup); } else { -- 2.30.2