/*
- * $Id$
- *
* Default wordbreaker module for full text indexing.
*
+ * Copyright (c) 2005-2012 by the citadel.org team
+ *
+ * This program is open source software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3.
+ *
+ *
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ *
+ *
*/
#include <ctype.h>
#include <string.h>
#include <limits.h>
+#include <libcitadel.h>
#include "citadel.h"
#include "server.h"
#include "sysdep_decls.h"
#include "database.h"
#include "msgbase.h"
#include "control.h"
-#include "tools.h"
#include "ft_wordbreaker.h"
#include "crc16.h"
+#include "ctdl_module.h"
/*
* Noise words are not included in search indices.
* NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
* must also be changed, so that the index is rebuilt.
*/
-static char *noise_words[] = {
+
+noise_word *noise_words[26];	/* one linked list of noise words per first letter; bucket index is (first character - 'a') */
+
+static char *noise_words_init[] = {	/* built-in list; initialize_noise_words() copies each entry into noise_words[], bucketed by its first character */
"about",
"after",
"also",
"your"
};
+
+/*
+ * Build the noise word lookup table.
+ *
+ * Clears noise_words[] and then copies every entry of noise_words_init[]
+ * into a newly allocated node, which is pushed onto the front of the list
+ * selected by the entry's first character (index = first character - 'a').
+ *
+ * An entry whose first character does not map to a valid bucket is skipped
+ * rather than being allowed to index outside the table.  If memory cannot
+ * be allocated, the problem is logged and loading stops; the entries that
+ * were already loaded stay in place.
+ */
+void initialize_noise_words(void)
+{
+	size_t i;
+	int len;
+	int ch;
+	noise_word *next;
+	const int num_buckets = (int) (sizeof(noise_words) / sizeof(noise_words[0]));
+
+	memset(noise_words, 0, sizeof(noise_words));
+
+	for (i = 0; i < (sizeof(noise_words_init) / sizeof(noise_words_init[0])); ++i)
+	{
+		/* The bucket is chosen by the first letter; refuse anything out of range. */
+		ch = noise_words_init[i][0] - 'a';
+		if ((ch < 0) || (ch >= num_buckets))
+		{
+			continue;
+		}
+		len = strlen(noise_words_init[i]);
+
+		next = malloc(sizeof *next);
+		if (next == NULL)
+		{
+			syslog(LOG_ERR, "Out of memory while loading fulltext noise words.\n");
+			return;
+		}
+
+		next->word = strdup(noise_words_init[i]);
+		if (next->word == NULL)
+		{
+			/* Do not link a node that has no word; lookups would dereference it. */
+			free(next);
+			syslog(LOG_ERR, "Out of memory while loading fulltext noise words.\n");
+			return;
+		}
+
+		/* Push onto the front of this letter's list. */
+		next->len = len;
+		next->next = noise_words[ch];
+		noise_words[ch] = next;
+	}
+}
+
+
+/*
+ * Release the noise word lookup table.
+ *
+ * Walks each list in noise_words[], freeing every node and the word string
+ * it holds.  Each list head is reset to NULL afterwards, so the table reads
+ * as empty and calling this function a second time frees nothing twice.
+ */
+void noise_word_cleanup(void)
+{
+	size_t i;
+	noise_word *cur, *next;
+
+	syslog(LOG_INFO, "Cleaning up fulltext noise words.\n");
+
+	for (i = 0; i < (sizeof(noise_words) / sizeof(noise_words[0])); i++)
+	{
+		cur = noise_words[i];
+		/* Detach the list first so the table never points at freed nodes. */
+		noise_words[i] = NULL;
+		while (cur)
+		{
+			next = cur->next;	/* save the link before the node is freed */
+			free(cur->word);
+			free(cur);
+			cur = next;
+		}
+	}
+}
+
/*
* Compare function
*/
}
-void wordbreaker(char *text, int *num_tokens, int **tokens) {
+void wordbreaker(const char *text, int *num_tokens, int **tokens) {
int wb_num_tokens = 0;
int wb_num_alloc = 0;
int *wb_tokens = NULL;
- char *ptr;
- char *word_start;
- char *word_end;
+ const char *ptr;
+ const char *word_start;
+ const char *word_end;
char ch;
int word_len;
char word[256];
int i;
int word_crc;
-
+ noise_word *noise;
+
+
if (text == NULL) { /* no NULL text please */
*num_tokens = 0;
*tokens = NULL;
/* extract the word */
word_len = word_end - word_start;
if (word_len >= sizeof word) {
- lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+ syslog(LOG_DEBUG, "Invalid word length: %d\n", word_len);
safestrncpy(word, word_start, sizeof word);
word[(sizeof word) - 1] = 0;
}
word[i] = tolower(word[i]);
}
/* disqualify noise words */
- for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
- if (!strcmp(word, noise_words[i])) {
- word_len = 0;
- break;
+ noise = noise_words[(int) (word[0]-'a')];
+ while (noise)
+ {
+ if (noise->len == word_len)
+ {
+ if (!strcmp(word, noise->word))
+ {
+ word_len = 0;
+ break;
+ }
}
+ noise = noise->next;
}
if (word_len == 0)
continue;