mk_module_init.sh now tests to see if echo supports -e and -E

[citadel.git] / citadel / ft_wordbreaker.c
diff --git a/citadel/ft_wordbreaker.c b/citadel/ft_wordbreaker.c

index f330ac573b162dc189eb70c5e990de639cfa1f1a..6b9fb2d243b54122f4c8a6453ba77c6888b73777 100644 (file)
--- a/citadel/ft_wordbreaker.c
+++ b/citadel/ft_wordbreaker.c
@@ -37,7 +37,6 @@
  #include "citserver.h"
  #include "support.h"
  #include "config.h"
-#include "serv_extensions.h"
  #include "database.h"
  #include "msgbase.h"
  #include "control.h"
@@ -45,6 +44,133 @@
  #include "ft_wordbreaker.h"
  #include "crc16.h"
  
+/*
+ * Noise words (matched case-insensitively) are not included in search indices.
+ * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
+ * must also be changed, so that the index is rebuilt.
+ */
+static char *noise_words[] = {
+       "about",
+       "after",
+       "all",
+       "also",
+       "an",
+       "and",
+       "another",
+       "any",
+       "are",
+       "as",
+       "at",
+       "be",
+       "because",
+       "been",
+       "before",
+       "being",
+       "between",
+       "both",
+       "but",
+       "by",
+       "came",
+       "can",
+       "come",
+       "could",
+       "did",
+       "do",
+       "each",
+       "for",
+       "from",
+       "get",
+       "got",
+       "had",
+       "has",
+       "have",
+       "he",
+       "her",
+       "here",
+       "him",
+       "himself",
+       "his",
+       "how",
+       "if",
+       "in",
+       "into",
+       "is",
+       "it",
+       "like",
+       "make",
+       "many",
+       "me",
+       "might",
+       "more",
+       "most",
+       "much",
+       "must",
+       "my",
+       "never",
+       "now",
+       "of",
+       "on",
+       "only",
+       "or",
+       "other",
+       "our",
+       "out",
+       "over",
+       "said",
+       "same",
+       "see",
+       "should",
+       "since",
+       "some",
+       "still",
+       "such",
+       "take",
+       "than",
+       "that",
+       "the",
+       "their",
+       "them",
+       "then",
+       "there",
+       "these",
+       "they",
+       "this",
+       "those",
+       "through",
+       "to",
+       "too",
+       "under",
+       "up",
+       "very",
+       "was",
+       "way",
+       "we",
+       "well",
+       "were",
+       "what",
+       "where",
+       "which",
+       "while",
+       "with",
+       "would",
+       "you",
+       "your"
+};
+
+/*
+ * qsort() comparison callback for int elements: returns 1, -1 or 0 (ascending order)
+ */
+int intcmp(const void *rec1, const void *rec2) {
+       int i1, i2;
+
+       i1 = *(const int *)rec1;
+       i2 = *(const int *)rec2;
+
+       if (i1 > i2) return(1);
+       if (i1 < i2) return(-1);
+       return(0);
+}
+
  
  void wordbreaker(char *text, int *num_tokens, int **tokens) {
  
@@ -91,9 +217,23 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                         /* extract the word */
                         word_len = word_end - word_start + 1;
                         safestrncpy(word, word_start, sizeof word);
-                       word[word_len] = 0;
+                       if (word_len >= sizeof word) {
+                               lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+                               word[(sizeof word_len) - 1] = 0;
+                       }
+                       else {
+                               word[word_len] = 0;
+                       }
                         word_start = NULL;
  
+                       /* disqualify noise words */
+                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+                               if (!strcasecmp(word, noise_words[i])) {
+                                       word_len = 0;
+                                       break;
+                               }
+                       }
+
                         /* are we ok with the length? */
                         if ( (word_len >= WB_MIN)
                            && (word_len <= WB_MAX) ) {
@@ -113,6 +253,19 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                 }
         }
  
+       /* sort and purge dups */
+       if (wb_num_tokens > 1) {
+               qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
+               for (i=0; i<(wb_num_tokens-1); ++i) {
+                       if (wb_tokens[i] == wb_tokens[i+1]) {
+                               memmove(&wb_tokens[i], &wb_tokens[i+1],
+                                       ((wb_num_tokens - i - 1)*sizeof(int)));
+                               --wb_num_tokens;
+                               --i;
+                       }
+               }
+       }
+
         *num_tokens = wb_num_tokens;
         *tokens = wb_tokens;
  }