* More license declarations
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
index 12b0f667c00d1b111073cfe86da475adeaf0fe12..0e8dade3bc9747b103bbb3f12100ea4cd3a70f43 100644 (file)
@@ -3,6 +3,21 @@
  *
  * Default wordbreaker module for full text indexing.
  *
+ * Copyright (c) 2005-2009 by the citadel.org team
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 
@@ -31,6 +46,7 @@
 #include <ctype.h>
 #include <string.h>
 #include <limits.h>
+#include <libcitadel.h>
 #include "citadel.h"
 #include "server.h"
 #include "sysdep_decls.h"
 #include "database.h"
 #include "msgbase.h"
 #include "control.h"
-#include "tools.h"
 #include "ft_wordbreaker.h"
 #include "crc16.h"
+#include "ctdl_module.h"
 
 /*
  * Noise words are not included in search indices.
  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  * must also be changed, so that the index is rebuilt.
  */
-static char *noise_words[] = {
+
+noise_word *noise_words[26];   /* noise word lists bucketed by first letter: slot 0 is 'a' ... slot 25 is 'z' */
+
+static char *noise_words_init[] = {
        "about",
        "after",
        "also",
@@ -113,6 +132,50 @@ static char *noise_words[] = {
        "your"
 };
 
+
+/*
+ * Build the noise word lookup table.
+ *
+ * Every entry of noise_words_init is copied into a heap-allocated node and
+ * pushed onto the front of the list in noise_words[] selected by its first
+ * letter ('a' -> slot 0 ... 'z' -> slot 25).
+ *
+ * Entries that do not begin with a lowercase ASCII letter, and entries whose
+ * storage cannot be allocated, are skipped: a bad table entry must not index
+ * outside noise_words[], and an out-of-memory condition must not lead to a
+ * NULL dereference.
+ */
+void initialize_noise_words(void)
+{
+       size_t i;
+       int len;
+       int ch;
+       noise_word *next;
+       
+       memset (noise_words, 0, sizeof(noise_words));
+       
+       for (i=0; i<(sizeof(noise_words_init)/sizeof(noise_words_init[0])); ++i)
+       {
+               /* bucket index; it must land inside noise_words[] */
+               ch = noise_words_init[i][0] - 'a';
+               if ((ch < 0) || (ch >= (int)(sizeof(noise_words)/sizeof(noise_words[0]))))
+                       continue;
+               
+               len = strlen(noise_words_init[i]);
+               
+               next = malloc(sizeof(noise_word));
+               if (next == NULL)
+                       continue;
+               
+               next->word = strdup(noise_words_init[i]);
+               if (next->word == NULL)
+               {
+                       /* do not link a node that has no word to compare */
+                       free(next);
+                       continue;
+               }
+               next->len = len;
+               
+               /* push onto the front of this letter's list */
+               next->next = noise_words[ch];
+               noise_words[ch] = next;
+       }
+}
+
+
+/*
+ * Free every node held in the noise_words[] table.
+ *
+ * Each list head is cleared before its chain is released, so the table never
+ * holds pointers to freed memory: a lookup made after cleanup sees empty
+ * lists, and calling this function a second time frees nothing.
+ */
+void noise_word_cleanup(void)
+{
+       int i;
+       noise_word *cur, *next;
+       
+       CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
+       
+       for (i = 0 ; i < 26 ; i++)
+       {
+               cur = noise_words[i];
+               noise_words[i] = NULL;          /* detach the list before freeing it */
+               while (cur)
+               {
+                       next = cur->next;       /* read the link before the node is freed */
+                       free(cur->word);
+                       free(cur);
+                       cur = next;
+               }
+       }
+}
+
 /*
  * Compare function
  */
@@ -142,7 +205,9 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
        char word[256];
        int i;
        int word_crc;
-
+       noise_word *noise;
+       
+       
        if (text == NULL) {             /* no NULL text please */
                *num_tokens = 0;
                *tokens = NULL;
@@ -173,7 +238,7 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                        /* extract the word */
                        word_len = word_end - word_start;
                        if (word_len >= sizeof word) {
-                               lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+                               CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
                                safestrncpy(word, word_start, sizeof word);
                                word[(sizeof word) - 1] = 0;
                        }
@@ -190,11 +255,18 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                                        word[i] = tolower(word[i]);
                                }
                                /* disqualify noise words */
-                               for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
-                                       if (!strcmp(word, noise_words[i])) {
-                                               word_len = 0;
-                                               break;
+                               noise = noise_words[(int) (word[0]-'a')];
+                               while (noise)
+                               {
+                                       if (noise->len == word_len)
+                                       {
+                                               if (!strcmp(word, noise->word)) 
+                                               {
+                                                       word_len = 0;
+                                                       break;
+                                               }
                                        }
+                                       noise = noise->next;
                                }
                                if (word_len == 0)
                                        continue;