mk_module_init.sh now tests to see if echo supports -e and -E

[citadel.git] / citadel / ft_wordbreaker.c
diff --git a/citadel/ft_wordbreaker.c b/citadel/ft_wordbreaker.c

index 4fc3f3aace3373f878e69762eb281fa25a80d337..6b9fb2d243b54122f4c8a6453ba77c6888b73777 100644 (file)
--- a/citadel/ft_wordbreaker.c
+++ b/citadel/ft_wordbreaker.c
@@ -28,6 +28,7 @@
  #endif
  
  #include <sys/wait.h>
+#include <ctype.h>
  #include <string.h>
  #include <limits.h>
  #include "citadel.h"
@@ -36,12 +37,139 @@
  #include "citserver.h"
  #include "support.h"
  #include "config.h"
-#include "serv_extensions.h"
  #include "database.h"
  #include "msgbase.h"
  #include "control.h"
  #include "tools.h"
  #include "ft_wordbreaker.h"
+#include "crc16.h"
+
+/*
+ * Noise words are not included in search indices; wordbreaker() disqualifies
+ * any word matching an entry here, using strcasecmp().  NOTE: if this list is
+ * altered in any way, FT_WORDBREAKER_ID must change too so the index is rebuilt.
+ */
+static char *noise_words[] = {
+       "about",
+       "after",
+       "all",
+       "also",
+       "an",
+       "and",
+       "another",
+       "any",
+       "are",
+       "as",
+       "at",
+       "be",
+       "because",
+       "been",
+       "before",
+       "being",
+       "between",
+       "both",
+       "but",
+       "by",
+       "came",
+       "can",
+       "come",
+       "could",
+       "did",
+       "do",
+       "each",
+       "for",
+       "from",
+       "get",
+       "got",
+       "had",
+       "has",
+       "have",
+       "he",
+       "her",
+       "here",
+       "him",
+       "himself",
+       "his",
+       "how",
+       "if",
+       "in",
+       "into",
+       "is",
+       "it",
+       "like",
+       "make",
+       "many",
+       "me",
+       "might",
+       "more",
+       "most",
+       "much",
+       "must",
+       "my",
+       "never",
+       "now",
+       "of",
+       "on",
+       "only",
+       "or",
+       "other",
+       "our",
+       "out",
+       "over",
+       "said",
+       "same",
+       "see",
+       "should",
+       "since",
+       "some",
+       "still",
+       "such",
+       "take",
+       "than",
+       "that",
+       "the",
+       "their",
+       "them",
+       "then",
+       "there",
+       "these",
+       "they",
+       "this",
+       "those",
+       "through",
+       "to",
+       "too",
+       "under",
+       "up",
+       "very",
+       "was",
+       "way",
+       "we",
+       "well",
+       "were",
+       "what",
+       "where",
+       "which",
+       "while",
+       "with",
+       "would",
+       "you",
+       "your"
+};
+
+/*
+ * Compare function for qsort(): orders two ints ascending (returns 1, -1 or 0)
+ */
+int intcmp(const void *rec1, const void *rec2) {
+       int i1, i2;
+
+       i1 = *(const int *)rec1;
+       i2 = *(const int *)rec2;
+
+       if (i1 > i2) return(1);
+       if (i1 < i2) return(-1);
+       return(0);
+}
  
  
  void wordbreaker(char *text, int *num_tokens, int **tokens) {
@@ -50,12 +178,93 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
         int wb_num_alloc = 0;
         int *wb_tokens = NULL;
  
-       wb_num_tokens = 3;
-       wb_tokens = malloc(wb_num_tokens * sizeof(int));
+       char *ptr;
+       char *word_start;
+       char *word_end;
+       char ch;
+       int word_len;
+       char word[256];
+       int i;
+       int word_crc;
+
+       if (text == NULL) {             /* no NULL text please */
+               *num_tokens = 0;
+               *tokens = NULL;
+               return;
+       }
+
+       if (text[0] == 0) {             /* no empty text either */
+               *num_tokens = 0;
+               *tokens = NULL;
+               return;
+       }
+
+       ptr = text;
+       word_start = NULL;
+       while (*ptr) {
+               ch = *ptr;
+               if (isalnum(ch)) {
+                       if (!word_start) {
+                               word_start = ptr;
+                       }
+               }
+               ++ptr;
+               ch = *ptr;
+               if ( (!isalnum(ch)) && (word_start) ) {
+                       word_end = ptr;
+                       --word_end;
+
+                       /* extract the word */
+                       word_len = word_end - word_start + 1;
+                       safestrncpy(word, word_start, sizeof word);
+                       if (word_len >= sizeof word) {
+                               lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+                               word[(sizeof word_len) - 1] = 0;
+                       }
+                       else {
+                               word[word_len] = 0;
+                       }
+                       word_start = NULL;
+
+                       /* disqualify noise words */
+                       for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+                               if (!strcasecmp(word, noise_words[i])) {
+                                       word_len = 0;
+                                       break;
+                               }
+                       }
+
+                       /* are we ok with the length? */
+                       if ( (word_len >= WB_MIN)
+                          && (word_len <= WB_MAX) ) {
+                               for (i=0; i<word_len; ++i) {
+                                       word[i] = tolower(word[i]);
+                               }
+                               word_crc = (int)
+                                       CalcCRC16Bytes(word_len, word);
+
+                               ++wb_num_tokens;
+                               if (wb_num_tokens > wb_num_alloc) {
+                                       wb_num_alloc += 512;
+                                       wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+                               }
+                               wb_tokens[wb_num_tokens - 1] = word_crc;
+                       }
+               }
+       }
  
-       wb_tokens[0] = 6;
-       wb_tokens[1] = 7;       /* FIXME this obviously isn't a wordbreaker */
-       wb_tokens[2] = 8;
+       /* sort and purge dups */
+       if (wb_num_tokens > 1) {
+               qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
+               for (i=0; i<(wb_num_tokens-1); ++i) {
+                       if (wb_tokens[i] == wb_tokens[i+1]) {
+                               memmove(&wb_tokens[i], &wb_tokens[i+1],
+                                       ((wb_num_tokens - i - 1)*sizeof(int)));
+                               --wb_num_tokens;
+                               --i;
+                       }
+               }
+       }
  
         *num_tokens = wb_num_tokens;
         *tokens = wb_tokens;