A small speedup in the wordbreaker for full-text indexing.

author Dave West <davew@uncensored.citadel.org>

Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)

committer Dave West <davew@uncensored.citadel.org>

Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)
author Dave West <davew@uncensored.citadel.org>
Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)
committer Dave West <davew@uncensored.citadel.org>
Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)
diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c

index 6b9fb2d243b54122f4c8a6453ba77c6888b73777..12b0f667c00d1b111073cfe86da475adeaf0fe12 100644 (file)
--- a/citadel/modules/fulltext/ft_wordbreaker.c
+++ b/citadel/modules/fulltext/ft_wordbreaker.c
@@ -52,73 +52,37 @@
  static char *noise_words[] = {
         "about",
         "after",
-       "all",
         "also",
-       "an",
-       "and",
         "another",
-       "any",
-       "are",
-       "as",
-       "at",
-       "be",
         "because",
         "been",
         "before",
         "being",
         "between",
         "both",
-       "but",
-       "by",
         "came",
-       "can",
         "come",
         "could",
-       "did",
-       "do",
         "each",
-       "for",
         "from",
-       "get",
-       "got",
-       "had",
-       "has",
         "have",
-       "he",
-       "her",
         "here",
-       "him",
         "himself",
-       "his",
-       "how",
-       "if",
-       "in",
         "into",
-       "is",
-       "it",
         "like",
         "make",
         "many",
-       "me",
         "might",
         "more",
         "most",
         "much",
         "must",
-       "my",
         "never",
-       "now",
-       "of",
-       "on",
         "only",
-       "or",
         "other",
-       "our",
-       "out",
         "over",
         "said",
         "same",
-       "see",
         "should",
         "since",
         "some",
@@ -127,7 +91,6 @@ static char *noise_words[] = {
         "take",
         "than",
         "that",
-       "the",
         "their",
         "them",
         "then",
@@ -137,14 +100,8 @@ static char *noise_words[] = {
         "this",
         "those",
         "through",
-       "to",
-       "too",
         "under",
-       "up",
         "very",
-       "was",
-       "way",
-       "we",
         "well",
         "were",
         "what",
@@ -153,7 +110,6 @@ static char *noise_words[] = {
         "while",
         "with",
         "would",
-       "you",
         "your"
  };
  
@@ -212,34 +168,37 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                 ch = *ptr;
                 if ( (!isalnum(ch)) && (word_start) ) {
                         word_end = ptr;
-                       --word_end;
+//                     --word_end;
  
                         /* extract the word */
-                       word_len = word_end - word_start + 1;
-                       safestrncpy(word, word_start, sizeof word);
+                       word_len = word_end - word_start;
                         if (word_len >= sizeof word) {
                                 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
-                               word[(sizeof word_len) - 1] = 0;
+                               safestrncpy(word, word_start, sizeof word);
+                               word[(sizeof word) - 1] = 0;
                         }
                         else {
+                               safestrncpy(word, word_start, word_len+1);
                                 word[word_len] = 0;
                         }
                         word_start = NULL;
  
-                       /* disqualify noise words */
-                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
-                               if (!strcasecmp(word, noise_words[i])) {
-                                       word_len = 0;
-                                       break;
-                               }
-                       }
-
                         /* are we ok with the length? */
                         if ( (word_len >= WB_MIN)
                            && (word_len <= WB_MAX) ) {
                                 for (i=0; i<word_len; ++i) {
                                         word[i] = tolower(word[i]);
                                 }
+                               /* disqualify noise words */
+                               for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+                                       if (!strcmp(word, noise_words[i])) {
+                                               word_len = 0;
+                                               break;
+                                       }
+                               }
+                               if (word_len == 0)
+                                       continue;
+
                                 word_crc = (int)
                                         CalcCRC16Bytes(word_len, word);
  
diff --git a/citadel/modules/fulltext/ft_wordbreaker.h b/citadel/modules/fulltext/ft_wordbreaker.h

index 5f1fb99fe5efe183b2cecf4e695182e138217277..16f71456045d8f5b2675549b3e4434270d6681b3 100644 (file)
--- a/citadel/modules/fulltext/ft_wordbreaker.h
+++ b/citadel/modules/fulltext/ft_wordbreaker.h
@@ -9,12 +9,12 @@
   * later on, or even if we update this one, we can use a different ID so the
   * system knows it needs to throw away the existing index and rebuild it.
   */
-#define        FT_WORDBREAKER_ID       0x001f
+#define        FT_WORDBREAKER_ID       0x0021
  
  /*
   * Minimum and maximum length of words to index
   */
-#define WB_MIN                 3
+#define WB_MIN                 4       // nothing with 3 or less chars
  #define WB_MAX                 40
  
  void wordbreaker(char *text, int *num_tokens, int **tokens);
diff --git a/citadel/modules/fulltext/serv_fulltext.c b/citadel/modules/fulltext/serv_fulltext.c

index ba3f948a01af4885838f2918864738f11e2914eb..4e12ead4ae7c6513378e66a9c90eddc1f116c11e 100644 (file)
--- a/citadel/modules/fulltext/serv_fulltext.c
+++ b/citadel/modules/fulltext/serv_fulltext.c
@@ -219,7 +219,9 @@ void do_fulltext_indexing(void) {
         int i;
         static time_t last_index = 0L;
         static time_t last_progress = 0L;
-
+       time_t run_time = 0L;
+       time_t end_time = 0L;
+       
         /*
          * Don't do this if the site doesn't have it enabled.
          */
@@ -242,17 +244,18 @@ void do_fulltext_indexing(void) {
         if (CitControl.MMfulltext >= CitControl.MMhighest) {
                 return;         /* nothing to do! */
         }
-
-       lprintf(CTDL_DEBUG, "do_fulltext_indexing() started\n");
+       
+       run_time = time(NULL);
+       lprintf(CTDL_DEBUG, "do_fulltext_indexing() started (%ld)\n", run_time);
         
         /*
          * If we've switched wordbreaker modules, burn the index and start
          * over.
          */
         begin_critical_section(S_CONTROL);
-       lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
-                       CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
         if (CitControl.fulltext_wordbreaker != FT_WORDBREAKER_ID) {
+               lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
+                       CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
                 lprintf(CTDL_INFO, "(re)initializing full text index\n");
                 cdb_trunc(CDB_FULLTEXT);
                 CitControl.MMfulltext = 0L;
@@ -310,6 +313,8 @@ void do_fulltext_indexing(void) {
                 ft_num_alloc = 0;
                 ft_newmsgs = NULL;
         }
+       end_time = time(NULL);
+       lprintf(CTDL_DEBUG, "do_fulltext_indexing() duration (%ld)\n", end_time - run_time);
  
         /* Save our place so we don't have to do this again */
         ft_flush_cache();
author	Dave West <davew@uncensored.citadel.org>
	Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)
committer	Dave West <davew@uncensored.citadel.org>
	Tue, 13 Nov 2007 02:37:45 +0000 (02:37 +0000)
citadel/modules/fulltext/ft_wordbreaker.c		patch | blob | history
citadel/modules/fulltext/ft_wordbreaker.h		patch | blob | history
citadel/modules/fulltext/serv_fulltext.c		patch | blob | history