mk_module_init.sh now tests to see if echo supports -e and -E

[citadel.git] / citadel / ft_wordbreaker.c
diff --git a/citadel/ft_wordbreaker.c b/citadel/ft_wordbreaker.c

index 6c69175d1b93b79066d9fa330bef48d33025dd9b..6b9fb2d243b54122f4c8a6453ba77c6888b73777 100644 (file)
--- a/citadel/ft_wordbreaker.c
+++ b/citadel/ft_wordbreaker.c
@@ -37,7 +37,6 @@
  #include "citserver.h"
  #include "support.h"
  #include "config.h"
-#include "serv_extensions.h"
  #include "database.h"
  #include "msgbase.h"
  #include "control.h"
@@ -45,6 +44,133 @@
  #include "ft_wordbreaker.h"
  #include "crc16.h"
  
+/*
+ * Noise words: wordbreaker() discards any word matching an entry here, so
+ * they are not included in search indices.  NOTE: if this list is altered in
+ * any way, FT_WORDBREAKER_ID must also be changed, so the index is rebuilt.
+ */
+static char *noise_words[] = {	/* alphabetical; scanned linearly with strcasecmp() */
+       "about",
+       "after",
+       "all",
+       "also",
+       "an",
+       "and",
+       "another",
+       "any",
+       "are",
+       "as",
+       "at",
+       "be",
+       "because",
+       "been",
+       "before",
+       "being",
+       "between",
+       "both",
+       "but",
+       "by",
+       "came",
+       "can",
+       "come",
+       "could",
+       "did",
+       "do",
+       "each",
+       "for",
+       "from",
+       "get",
+       "got",
+       "had",
+       "has",
+       "have",
+       "he",
+       "her",
+       "here",
+       "him",
+       "himself",
+       "his",
+       "how",
+       "if",
+       "in",
+       "into",
+       "is",
+       "it",
+       "like",
+       "make",
+       "many",
+       "me",
+       "might",
+       "more",
+       "most",
+       "much",
+       "must",
+       "my",
+       "never",
+       "now",
+       "of",
+       "on",
+       "only",
+       "or",
+       "other",
+       "our",
+       "out",
+       "over",
+       "said",
+       "same",
+       "see",
+       "should",
+       "since",
+       "some",
+       "still",
+       "such",
+       "take",
+       "than",
+       "that",
+       "the",
+       "their",
+       "them",
+       "then",
+       "there",
+       "these",
+       "they",
+       "this",
+       "those",
+       "through",
+       "to",
+       "too",
+       "under",
+       "up",
+       "very",
+       "was",
+       "way",
+       "we",
+       "well",
+       "were",
+       "what",
+       "where",
+       "which",
+       "while",
+       "with",
+       "would",
+       "you",
+       "your"
+};
+
+/*
+ * qsort() comparator for int elements; returns 1, -1 or 0 (ascending order)
+ */
+int intcmp(const void *rec1, const void *rec2) {
+       int i1, i2;
+
+       i1 = *(const int *)rec1;	/* each argument points at one int element */
+       i2 = *(const int *)rec2;
+
+       if (i1 > i2) return(1);	/* compare, don't subtract: i1 - i2 could overflow */
+       if (i1 < i2) return(-1);
+       return(0);
+}
+
  
  void wordbreaker(char *text, int *num_tokens, int **tokens) {
  
@@ -75,40 +201,67 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
  
         ptr = text;
         word_start = NULL;
-       while (ptr++, *ptr) {
+       while (*ptr) {
                 ch = *ptr;
                 if (isalnum(ch)) {
                         if (!word_start) {
                                 word_start = ptr;
                         }
                 }
-               else {
-                       if (word_start) {
-                               word_end = ptr;
-                               --word_end;
-
-                               /* extract the word */
-                               word_len = word_end - word_start + 1;
-                               safestrncpy(word, word_start, sizeof word);
+               ++ptr;
+               ch = *ptr;
+               if ( (!isalnum(ch)) && (word_start) ) {
+                       word_end = ptr;
+                       --word_end;
+
+                       /* extract the word */
+                       word_len = word_end - word_start + 1;
+                       safestrncpy(word, word_start, sizeof word);
+                       if (word_len >= sizeof word) {
+                               lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+                               word[(sizeof word_len) - 1] = 0;
+                       }
+                       else {
                                 word[word_len] = 0;
-                               word_start = NULL;
-
-                               /* are we ok with the length? */
-                               if ( (word_len >= WB_MIN)
-                                  && (word_len <= WB_MAX) ) {
-                                       for (i=0; i<word_len; ++i) {
-                                               word[i] = tolower(word[i]);
-                                       }
-                                       word_crc = (int)
-                                               CalcCRC16Bytes(word_len, word);
-
-                                       ++wb_num_tokens;
-                                       if (wb_num_tokens > wb_num_alloc) {
-                                               wb_num_alloc += 512;
-                                               wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
-                                       }
-                                       wb_tokens[wb_num_tokens - 1] = word_crc;
+                       }
+                       word_start = NULL;
+
+                       /* disqualify noise words */
+                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+                               if (!strcasecmp(word, noise_words[i])) {
+                                       word_len = 0;
+                                       break;
+                               }
+                       }
+
+                       /* are we ok with the length? */
+                       if ( (word_len >= WB_MIN)
+                          && (word_len <= WB_MAX) ) {
+                               for (i=0; i<word_len; ++i) {
+                                       word[i] = tolower(word[i]);
                                 }
+                               word_crc = (int)
+                                       CalcCRC16Bytes(word_len, word);
+
+                               ++wb_num_tokens;
+                               if (wb_num_tokens > wb_num_alloc) {
+                                       wb_num_alloc += 512;
+                                       wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+                               }
+                               wb_tokens[wb_num_tokens - 1] = word_crc;
+                       }
+               }
+       }
+
+       /* sort and purge dups */
+       if (wb_num_tokens > 1) {
+               qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
+               for (i=0; i<(wb_num_tokens-1); ++i) {
+                       if (wb_tokens[i] == wb_tokens[i+1]) {
+                               memmove(&wb_tokens[i], &wb_tokens[i+1],
+                                       ((wb_num_tokens - i - 1)*sizeof(int)));
+                               --wb_num_tokens;
+                               --i;
                         }
                 }
         }