A little bit of a speedup in the wordbreaker for full-text indexing.
author: Dave West <davew@uncensored.citadel.org>
Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)
committer: Dave West <davew@uncensored.citadel.org>
Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)
Also fixed a small bug that made some of our index tokens bad.
Maybe this will fix some of the text search problems some people are
seeing.
We also need to alter the indexer to index the message headers as well
as the body; at the moment we only index the body.

citadel/modules/fulltext/ft_wordbreaker.c
citadel/modules/fulltext/ft_wordbreaker.h
citadel/modules/fulltext/serv_fulltext.c

index 6b9fb2d243b54122f4c8a6453ba77c6888b73777..12b0f667c00d1b111073cfe86da475adeaf0fe12 100644 (file)
 static char *noise_words[] = {
        "about",
        "after",
-       "all",
        "also",
-       "an",
-       "and",
        "another",
-       "any",
-       "are",
-       "as",
-       "at",
-       "be",
        "because",
        "been",
        "before",
        "being",
        "between",
        "both",
-       "but",
-       "by",
        "came",
-       "can",
        "come",
        "could",
-       "did",
-       "do",
        "each",
-       "for",
        "from",
-       "get",
-       "got",
-       "had",
-       "has",
        "have",
-       "he",
-       "her",
        "here",
-       "him",
        "himself",
-       "his",
-       "how",
-       "if",
-       "in",
        "into",
-       "is",
-       "it",
        "like",
        "make",
        "many",
-       "me",
        "might",
        "more",
        "most",
        "much",
        "must",
-       "my",
        "never",
-       "now",
-       "of",
-       "on",
        "only",
-       "or",
        "other",
-       "our",
-       "out",
        "over",
        "said",
        "same",
-       "see",
        "should",
        "since",
        "some",
@@ -127,7 +91,6 @@ static char *noise_words[] = {
        "take",
        "than",
        "that",
-       "the",
        "their",
        "them",
        "then",
@@ -137,14 +100,8 @@ static char *noise_words[] = {
        "this",
        "those",
        "through",
-       "to",
-       "too",
        "under",
-       "up",
        "very",
-       "was",
-       "way",
-       "we",
        "well",
        "were",
        "what",
@@ -153,7 +110,6 @@ static char *noise_words[] = {
        "while",
        "with",
        "would",
-       "you",
        "your"
 };
 
@@ -212,34 +168,37 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                ch = *ptr;
                if ( (!isalnum(ch)) && (word_start) ) {
                        word_end = ptr;
-                       --word_end;
+//                     --word_end;
 
                        /* extract the word */
-                       word_len = word_end - word_start + 1;
-                       safestrncpy(word, word_start, sizeof word);
+                       word_len = word_end - word_start;
                        if (word_len >= sizeof word) {
                                lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
-                               word[(sizeof word_len) - 1] = 0;
+                               safestrncpy(word, word_start, sizeof word);
+                               word[(sizeof word) - 1] = 0;
                        }
                        else {
+                               safestrncpy(word, word_start, word_len+1);
                                word[word_len] = 0;
                        }
                        word_start = NULL;
 
-                       /* disqualify noise words */
-                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
-                               if (!strcasecmp(word, noise_words[i])) {
-                                       word_len = 0;
-                                       break;
-                               }
-                       }
-
                        /* are we ok with the length? */
                        if ( (word_len >= WB_MIN)
                           && (word_len <= WB_MAX) ) {
                                for (i=0; i<word_len; ++i) {
                                        word[i] = tolower(word[i]);
                                }
+                               /* disqualify noise words */
+                               for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+                                       if (!strcmp(word, noise_words[i])) {
+                                               word_len = 0;
+                                               break;
+                                       }
+                               }
+                               if (word_len == 0)
+                                       continue;
+
                                word_crc = (int)
                                        CalcCRC16Bytes(word_len, word);
 
index 5f1fb99fe5efe183b2cecf4e695182e138217277..16f71456045d8f5b2675549b3e4434270d6681b3 100644 (file)
@@ -9,12 +9,12 @@
  * later on, or even if we update this one, we can use a different ID so the
  * system knows it needs to throw away the existing index and rebuild it.
  */
-#define        FT_WORDBREAKER_ID       0x001f
+#define        FT_WORDBREAKER_ID       0x0021
 
 /*
  * Minimum and maximum length of words to index
  */
-#define WB_MIN                 3
+#define WB_MIN                 4       // nothing with 3 or less chars
 #define WB_MAX                 40
 
 void wordbreaker(char *text, int *num_tokens, int **tokens);
index ba3f948a01af4885838f2918864738f11e2914eb..4e12ead4ae7c6513378e66a9c90eddc1f116c11e 100644 (file)
@@ -219,7 +219,9 @@ void do_fulltext_indexing(void) {
        int i;
        static time_t last_index = 0L;
        static time_t last_progress = 0L;
-
+       time_t run_time = 0L;
+       time_t end_time = 0L;
+       
        /*
         * Don't do this if the site doesn't have it enabled.
         */
@@ -242,17 +244,18 @@ void do_fulltext_indexing(void) {
        if (CitControl.MMfulltext >= CitControl.MMhighest) {
                return;         /* nothing to do! */
        }
-
-       lprintf(CTDL_DEBUG, "do_fulltext_indexing() started\n");
+       
+       run_time = time(NULL);
+       lprintf(CTDL_DEBUG, "do_fulltext_indexing() started (%ld)\n", run_time);
        
        /*
         * If we've switched wordbreaker modules, burn the index and start
         * over.
         */
        begin_critical_section(S_CONTROL);
-       lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
-                       CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
        if (CitControl.fulltext_wordbreaker != FT_WORDBREAKER_ID) {
+               lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
+                       CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
                lprintf(CTDL_INFO, "(re)initializing full text index\n");
                cdb_trunc(CDB_FULLTEXT);
                CitControl.MMfulltext = 0L;
@@ -310,6 +313,8 @@ void do_fulltext_indexing(void) {
                ft_num_alloc = 0;
                ft_newmsgs = NULL;
        }
+       end_time = time(NULL);
+       lprintf(CTDL_DEBUG, "do_fulltext_indexing() duration (%ld)\n", end_time - run_time);
 
        /* Save our place so we don't have to do this again */
        ft_flush_cache();