*
* Default wordbreaker module for full text indexing.
*
+ * Copyright (c) 2005-2009 by the citadel.org team
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <ctype.h>
#include <string.h>
#include <limits.h>
+#include <libcitadel.h>
#include "citadel.h"
#include "server.h"
#include "sysdep_decls.h"
#include "database.h"
#include "msgbase.h"
#include "control.h"
-#include "tools.h"
#include "ft_wordbreaker.h"
#include "crc16.h"
+#include "ctdl_module.h"
/*
* Noise words are not included in search indices.
* NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
* must also be changed, so that the index is rebuilt.
*/
-static char *noise_words[] = {
+
+noise_word *noise_words[26];
+
+static char *noise_words_init[] = {
"about",
"after",
"also",
"your"
};
+
+/*
+ * Build the noise word lookup table.
+ *
+ * Each entry of noise_words_init[] is copied into a heap-allocated noise_word
+ * node and pushed onto the front of the list in noise_words[] selected by the
+ * word's first character ('a' selects slot 0).  The table is zeroed first
+ * without freeing anything, so call noise_word_cleanup() beforehand if the
+ * table may already be populated.
+ *
+ * An entry whose first character falls outside the table, or whose node or
+ * string copy cannot be allocated, is skipped.
+ */
+void initialize_noise_words(void)
+{
+	size_t i;
+	int len;
+	int ch;
+	noise_word *next;
+
+	memset(noise_words, 0, sizeof(noise_words));
+
+	for (i = 0; i < (sizeof(noise_words_init) / sizeof(noise_words_init[0])); ++i)
+	{
+		/* Never index outside noise_words[], whatever the list contains */
+		ch = noise_words_init[i][0] - 'a';
+		if ((ch < 0) || (ch >= (int) (sizeof(noise_words) / sizeof(noise_words[0]))))
+		{
+			continue;
+		}
+		len = strlen(noise_words_init[i]);
+
+		next = malloc(sizeof(*next));
+		if (next == NULL)
+		{
+			continue;
+		}
+		next->word = strdup(noise_words_init[i]);
+		if (next->word == NULL)
+		{
+			/* Do not link a node that has no word to compare against */
+			free(next);
+			continue;
+		}
+		next->len = len;
+
+		/* Push onto the head of this letter's list */
+		next->next = noise_words[ch];
+		noise_words[ch] = next;
+	}
+}
+
+
+/*
+ * Free every node of the noise word table, together with the word string
+ * each node owns.  Each list head is reset to NULL before its nodes are
+ * released, so the table never points at freed memory and a repeated call
+ * is harmless.
+ */
+void noise_word_cleanup(void)
+{
+	size_t i;
+	noise_word *cur, *next;
+
+	CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
+
+	for (i = 0; i < (sizeof(noise_words) / sizeof(noise_words[0])); i++)
+	{
+		cur = noise_words[i];
+		noise_words[i] = NULL;
+		while (cur)
+		{
+			/* Save the successor before the node is released */
+			next = cur->next;
+			free(cur->word);
+			free(cur);
+			cur = next;
+		}
+	}
+}
+
/*
* Compare function
*/
char word[256];
int i;
int word_crc;
-
+ noise_word *noise;
+
+
if (text == NULL) { /* no NULL text please */
*num_tokens = 0;
*tokens = NULL;
/* extract the word */
word_len = word_end - word_start;
if (word_len >= sizeof word) {
- lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+ CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
safestrncpy(word, word_start, sizeof word);
word[(sizeof word) - 1] = 0;
}
word[i] = tolower(word[i]);
}
/* disqualify noise words */
- for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
- if (!strcmp(word, noise_words[i])) {
- word_len = 0;
- break;
+ noise = noise_words[(int) (word[0]-'a')];
+ while (noise)
+ {
+ if (noise->len == word_len)
+ {
+ if (!strcmp(word, noise->word))
+ {
+ word_len = 0;
+ break;
+ }
}
+ noise = noise->next;
}
if (word_len == 0)
continue;