Speed up for the indexer.
author Dave West <davew@uncensored.citadel.org>
Mon, 16 Jun 2008 17:25:53 +0000 (17:25 +0000)
committer Dave West <davew@uncensored.citadel.org>
Mon, 16 Jun 2008 17:25:53 +0000 (17:25 +0000)
This will NOT cause a rebuild of the index, but it does significantly
improve performance when indexing a new message or when the index needs to
be rebuilt.
The list of noise words is now split into per-first-letter lists with each word's length stored, so it is processed much faster.

citadel/modules/fulltext/ft_wordbreaker.c
citadel/modules/fulltext/ft_wordbreaker.h
citadel/modules/fulltext/serv_fulltext.c

index aaa92a208052925c017fb0da64063c570a40bd4e..664a7de8dc99c4b3794b8e051155d72209b23d2a 100644 (file)
  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  * must also be changed, so that the index is rebuilt.
  */
-static char *noise_words[] = {
+/* Noise words bucketed by first letter: slot (c - 'a') heads a linked list built by initialize_noise_words(). */
+noise_word *noise_words[26];
+
+static char *noise_words_init[] = {
        "about",
        "after",
        "also",
@@ -114,6 +117,50 @@ static char *noise_words[] = {
        "your"
 };
 
+/* Build noise_words[] from noise_words_init[]: one malloc()'d list node per word, bucketed by first letter. No parameters, no return value. */
+void initialize_noise_words(void)
+{
+       int i;
+       int len;
+       int ch;         /* bucket index: first character of the word minus 'a' */
+       noise_word *next;       /* node being built for the current word */
+       /* Start with every bucket empty. */
+       memset (noise_words, 0, sizeof(noise_words));
+       /* NOTE(review): i is int while the sizeof quotient is size_t, so the loop compare below is mixed-sign. */
+       for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
+       {
+               ch = noise_words_init[i][0] - 'a';      /* NOTE(review): assumes each entry starts with 'a'..'z'; ch is not range-checked against the 26 slots */
+               len = strlen(noise_words_init[i]);
+               /* NOTE(review): neither malloc() nor strdup() below is checked for NULL before the result is used. */
+               next = malloc(sizeof(noise_word));
+               next->len = len;
+               next->word = strdup(noise_words_init[i]);
+               next->next = noise_words[ch];   /* push the new node onto the front of its bucket */
+               noise_words[ch] = next;
+       }
+}
+/* Log a message, then free every noise-word node and its word string in all 26 buckets. No parameters, no return value. */
+/* NOTE(review): the noise_words[] slots are not reset to NULL here, so they keep pointing at freed nodes afterwards. */
+void noise_word_cleanup(void)
+{
+       int i;
+       noise_word *cur, *next;
+       
+       CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
+       
+       for (i = 0 ; i < 26 ; i++)
+       {
+               cur = noise_words[i];
+               while (cur)
+               {
+                       next = cur->next;       /* save the link before cur is freed */
+                       free(cur->word);
+                       free(cur);
+                       cur = next;
+               }
+       }
+}
+
 /*
  * Compare function
  */
@@ -143,7 +190,9 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
        char word[256];
        int i;
        int word_crc;
-
+       noise_word *noise;
+       
+       
        if (text == NULL) {             /* no NULL text please */
                *num_tokens = 0;
                *tokens = NULL;
@@ -191,11 +240,18 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                                        word[i] = tolower(word[i]);
                                }
                                /* disqualify noise words */
-                               for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
-                                       if (!strcmp(word, noise_words[i])) {
-                                               word_len = 0;
-                                               break;
+                               noise = noise_words[(int) (word[0]-'a')];
+                               while (noise)
+                               {
+                                       if (noise->len == word_len)
+                                       {
+                                               if (!strcmp(word, noise->word)) 
+                                               {
+                                                       word_len = 0;
+                                                       break;
+                                               }
                                        }
+                                       noise = noise->next;
                                }
                                if (word_len == 0)
                                        continue;
index 16f71456045d8f5b2675549b3e4434270d6681b3..b643af57808698fd715003618f3e0e79c74c18fe 100644 (file)
 
 void wordbreaker(char *text, int *num_tokens, int **tokens);
 
+void initialize_noise_words(void);
+void noise_word_cleanup(void);
+
+/* One node in a singly linked list of noise words, i.e. words that wordbreaker() disqualifies. */
+typedef struct noise_word noise_word;
+/* Nodes are created by initialize_noise_words() and released by noise_word_cleanup(). */
+struct noise_word {
+       unsigned int len;       /* strlen() of word; wordbreaker() compares this before calling strcmp() */
+       char *word;             /* strdup()'d copy of the noise word */
+       noise_word *next;       /* next node in the same bucket; the last node's link is zero-filled by memset() */
+};
index 280c6ca1f1218fbf5ee600557879b6bd8c74f84d..d6eac6cb7d3cfb11ab12417858755a1d0e127154 100644 (file)
@@ -248,7 +248,7 @@ void do_fulltext_indexing(void) {
         * Check to see whether the fulltext index is up to date; if there
         * are no messages to index, don't waste any more time trying.
         */
-       if (CitControl.MMfulltext >= CitControl.MMhighest) {
+       if ((CitControl.MMfulltext >= CitControl.MMhighest) && (CitControl.fulltext_wordbreaker == FT_WORDBREAKER_ID)) {
                return;         /* nothing to do! */
        }
        
@@ -496,9 +496,11 @@ CTDL_MODULE_INIT(fulltext)
        if (!threading)
        {
                initialize_ft_cache();
+               initialize_noise_words();
                CtdlRegisterProtoHook(cmd_srch, "SRCH", "Full text search");
                CtdlRegisterDeleteHook(ft_delete_remove);
                CtdlRegisterSearchFuncHook(ft_search, "fulltext");
+               CtdlRegisterCleanupHook(noise_word_cleanup);
        }
        else
        {