No more cleanup hooks. The OS can reclaim memory better than we can. We want to...

[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c

index 6b9fb2d243b54122f4c8a6453ba77c6888b73777..968fa9f02b0aa2bf21ddf87890a0e19887d8a56a 100644 (file)
--- a/citadel/modules/fulltext/ft_wordbreaker.c
+++ b/citadel/modules/fulltext/ft_wordbreaker.c
@@ -1,11 +1,17 @@
  /*
- * $Id$
- *
   * Default wordbreaker module for full text indexing.
   *
+ * Copyright (c) 2005-2017 by the citadel.org team
+ *
+ * This program is open source software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
   */
  
-
  #include "sysdep.h"
  #include <stdlib.h>
  #include <unistd.h>
@@ -31,6 +37,7 @@
  #include <ctype.h>
  #include <string.h>
  #include <limits.h>
+#include <libcitadel.h>
  #include "citadel.h"
  #include "server.h"
  #include "sysdep_decls.h"
@@ -40,85 +47,52 @@
  #include "database.h"
  #include "msgbase.h"
  #include "control.h"
-#include "tools.h"
  #include "ft_wordbreaker.h"
  #include "crc16.h"
+#include "ctdl_module.h"
  
  /*
   * Noise words are not included in search indices.
   * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
   * must also be changed, so that the index is rebuilt.
   */
-static char *noise_words[] = {
+
+noise_word *noise_words[26];   /* heads of the noise word lists, indexed by (first letter - 'a') */
+
+static char *noise_words_init[] = {
         "about",
         "after",
-       "all",
         "also",
-       "an",
-       "and",
         "another",
-       "any",
-       "are",
-       "as",
-       "at",
-       "be",
         "because",
         "been",
         "before",
         "being",
         "between",
         "both",
-       "but",
-       "by",
         "came",
-       "can",
         "come",
         "could",
-       "did",
-       "do",
         "each",
-       "for",
         "from",
-       "get",
-       "got",
-       "had",
-       "has",
         "have",
-       "he",
-       "her",
         "here",
-       "him",
         "himself",
-       "his",
-       "how",
-       "if",
-       "in",
         "into",
-       "is",
-       "it",
         "like",
         "make",
         "many",
-       "me",
         "might",
         "more",
         "most",
         "much",
         "must",
-       "my",
         "never",
-       "now",
-       "of",
-       "on",
         "only",
-       "or",
         "other",
-       "our",
-       "out",
         "over",
         "said",
         "same",
-       "see",
         "should",
         "since",
         "some",
@@ -127,7 +101,6 @@ static char *noise_words[] = {
         "take",
         "than",
         "that",
-       "the",
         "their",
         "them",
         "then",
@@ -137,14 +110,8 @@ static char *noise_words[] = {
         "this",
         "those",
         "through",
-       "to",
-       "too",
         "under",
-       "up",
         "very",
-       "was",
-       "way",
-       "we",
         "well",
         "were",
         "what",
@@ -153,10 +120,33 @@ static char *noise_words[] = {
         "while",
         "with",
         "would",
-       "you",
         "your"
  };
  
+
+/* Build the noise word lists, one per initial letter; a word that cannot be stored is skipped. */
+void initialize_noise_words(void)
+{
+       size_t i;
+       int ch;
+       noise_word *next;
+
+       memset(noise_words, 0, sizeof(noise_words));
+
+       for (i = 0; i < (sizeof(noise_words_init) / sizeof(noise_words_init[0])); ++i) {
+               ch = noise_words_init[i][0] - 'a';      /* bucket index from the first letter */
+               if ((ch < 0) || (ch >= (int) (sizeof(noise_words) / sizeof(noise_words[0])))) continue;
+               next = malloc(sizeof *next);
+               if (next == NULL) continue;             /* out of memory: leave this word out */
+               next->len = (int) strlen(noise_words_init[i]);
+               next->word = strdup(noise_words_init[i]);
+               if (next->word == NULL) { free(next); continue; }  /* never link a NULL word */
+               next->next = noise_words[ch];           /* push onto the head of the bucket */
+               noise_words[ch] = next;
+       }
+}
+
+
  /*
   * Compare function
   */
@@ -172,21 +162,23 @@ int intcmp(const void *rec1, const void *rec2) {
  }
  
  
-void wordbreaker(char *text, int *num_tokens, int **tokens) {
+void wordbreaker(const char *text, int *num_tokens, int **tokens) {
  
         int wb_num_tokens = 0;
         int wb_num_alloc = 0;
         int *wb_tokens = NULL;
  
-       char *ptr;
-       char *word_start;
-       char *word_end;
+       const char *ptr;
+       const char *word_start;
+       const char *word_end;
         char ch;
         int word_len;
         char word[256];
         int i;
         int word_crc;
-
+       noise_word *noise;
+       
+       
         if (text == NULL) {             /* no NULL text please */
                 *num_tokens = 0;
                 *tokens = NULL;
@@ -212,34 +204,43 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                 ch = *ptr;
                 if ( (!isalnum(ch)) && (word_start) ) {
                         word_end = ptr;
-                       --word_end;
  
                         /* extract the word */
-                       word_len = word_end - word_start + 1;
-                       safestrncpy(word, word_start, sizeof word);
+                       word_len = word_end - word_start;
                         if (word_len >= sizeof word) {
-                               lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
-                               word[(sizeof word_len) - 1] = 0;
+                               syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
+                               safestrncpy(word, word_start, sizeof word);
+                               word[(sizeof word) - 1] = 0;
                         }
                         else {
+                               safestrncpy(word, word_start, word_len+1);
                                 word[word_len] = 0;
                         }
                         word_start = NULL;
  
-                       /* disqualify noise words */
-                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
-                               if (!strcasecmp(word, noise_words[i])) {
-                                       word_len = 0;
-                                       break;
-                               }
-                       }
-
                         /* are we ok with the length? */
                         if ( (word_len >= WB_MIN)
                            && (word_len <= WB_MAX) ) {
                                 for (i=0; i<word_len; ++i) {
                                         word[i] = tolower(word[i]);
                                 }
+                               /* disqualify noise words */
+                               noise = noise_words[(int) (word[0]-'a')];
+                               while (noise)
+                               {
+                                       if (noise->len == word_len)
+                                       {
+                                               if (!strcmp(word, noise->word)) 
+                                               {
+                                                       word_len = 0;
+                                                       break;
+                                               }
+                                       }
+                                       noise = noise->next;
+                               }
+                               if (word_len == 0)
+                                       continue;
+
                                 word_crc = (int)
                                         CalcCRC16Bytes(word_len, word);