*
* Default wordbreaker module for full text indexing.
*
+ * Copyright (c) 2005-2009 by the citadel.org team
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <ctype.h>
#include <string.h>
#include <limits.h>
+#include <libcitadel.h>
#include "citadel.h"
#include "server.h"
#include "sysdep_decls.h"
#include "database.h"
#include "msgbase.h"
#include "control.h"
-#include "tools.h"
#include "ft_wordbreaker.h"
#include "crc16.h"
+#include "ctdl_module.h"
/*
* Noise words are not included in search indices.
* NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
* must also be changed, so that the index is rebuilt.
*/
-static char *noise_words[] = {
+
+noise_word *noise_words[26];	/* lookup table: slot N heads a linked list of the
+				 * noise words whose first letter is 'a' + N */
+
+static char *noise_words_init[] = {	/* seed list copied into noise_words[] by initialize_noise_words();
+					 * each entry's first character minus 'a' is used as the table index */
"about",
"after",
- "all",
"also",
- "an",
- "and",
"another",
- "any",
- "are",
- "as",
- "at",
- "be",
"because",
"been",
"before",
"being",
"between",
"both",
- "but",
- "by",
"came",
- "can",
"come",
"could",
- "did",
- "do",
"each",
- "for",
"from",
- "get",
- "got",
- "had",
- "has",
"have",
- "he",
- "her",
"here",
- "him",
"himself",
- "his",
- "how",
- "if",
- "in",
"into",
- "is",
- "it",
"like",
"make",
"many",
- "me",
"might",
"more",
"most",
"much",
"must",
- "my",
"never",
- "now",
- "of",
- "on",
"only",
- "or",
"other",
- "our",
- "out",
"over",
"said",
"same",
- "see",
"should",
"since",
"some",
"take",
"than",
"that",
- "the",
"their",
"them",
"then",
"this",
"those",
"through",
- "to",
- "too",
"under",
- "up",
"very",
- "was",
- "way",
- "we",
"well",
"were",
"what",
"while",
"with",
"would",
- "you",
"your"
};
+
+/*
+ * Build the noise word lookup table.
+ *
+ * Every string in noise_words_init[] is duplicated into a freshly allocated
+ * noise_word node, which is pushed onto the front of the list in
+ * noise_words[] selected by the word's first letter ('a' -> slot 0).
+ * The table is zeroed first, so any nodes it held before are not freed here.
+ *
+ * Entries whose first character is not a lowercase ASCII letter have no slot
+ * and are skipped.  If an allocation fails, loading stops and the words
+ * loaded so far stay in the table.
+ */
+void initialize_noise_words(void)
+{
+	size_t i;
+	size_t count = sizeof(noise_words_init) / sizeof(noise_words_init[0]);
+	int slots = (int) (sizeof(noise_words) / sizeof(noise_words[0]));
+	int len;
+	int ch;
+	noise_word *next;
+
+	memset (noise_words, 0, sizeof(noise_words));
+
+	for (i = 0; i < count; ++i)
+	{
+		/* The first letter picks the slot; never index outside the table */
+		ch = noise_words_init[i][0] - 'a';
+		if ( (ch < 0) || (ch >= slots) )
+		{
+			continue;
+		}
+		len = strlen(noise_words_init[i]);
+
+		next = malloc(sizeof *next);
+		if (next == NULL)
+		{
+			CtdlLogPrintf(CTDL_INFO, "Out of memory loading fulltext noise words.\n");
+			return;
+		}
+		next->word = strdup(noise_words_init[i]);
+		if (next->word == NULL)
+		{
+			/* Do not link a node that has no word to compare against */
+			free(next);
+			CtdlLogPrintf(CTDL_INFO, "Out of memory loading fulltext noise words.\n");
+			return;
+		}
+		next->len = len;
+
+		/* Push onto the head of this letter's list */
+		next->next = noise_words[ch];
+		noise_words[ch] = next;
+	}
+}
+
+
+/*
+ * Free every node in the noise word lookup table, together with the
+ * duplicated word string each node owns.
+ *
+ * Each slot is reset to NULL before its list is released, so the table
+ * never points at freed memory and calling this function again is harmless.
+ */
+void noise_word_cleanup(void)
+{
+	int i;
+	noise_word *cur, *next;
+
+	CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
+
+	for (i = 0 ; i < 26 ; i++)
+	{
+		cur = noise_words[i];
+		/* Detach the list first; the nodes are only reachable via cur now */
+		noise_words[i] = NULL;
+		while (cur)
+		{
+			/* Save the link before the node holding it is freed */
+			next = cur->next;
+			free(cur->word);
+			free(cur);
+			cur = next;
+		}
+	}
+}
+
/*
* Compare function
*/
char word[256];
int i;
int word_crc;
-
+ noise_word *noise;
+
+
if (text == NULL) { /* no NULL text please */
*num_tokens = 0;
*tokens = NULL;
ch = *ptr;
if ( (!isalnum(ch)) && (word_start) ) {
word_end = ptr;
- --word_end;
+// --word_end;
/* extract the word */
- word_len = word_end - word_start + 1;
- safestrncpy(word, word_start, sizeof word);
+ word_len = word_end - word_start;
if (word_len >= sizeof word) {
- lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
- word[(sizeof word_len) - 1] = 0;
+ CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+ safestrncpy(word, word_start, sizeof word);
+ word[(sizeof word) - 1] = 0;
}
else {
+ safestrncpy(word, word_start, word_len+1);
word[word_len] = 0;
}
word_start = NULL;
- /* disqualify noise words */
- for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
- if (!strcasecmp(word, noise_words[i])) {
- word_len = 0;
- break;
- }
- }
-
/* are we ok with the length? */
if ( (word_len >= WB_MIN)
&& (word_len <= WB_MAX) ) {
for (i=0; i<word_len; ++i) {
word[i] = tolower(word[i]);
}
+ /* disqualify noise words */
+ noise = noise_words[(int) (word[0]-'a')];
+ while (noise)
+ {
+ if (noise->len == word_len)
+ {
+ if (!strcmp(word, noise->word))
+ {
+ word_len = 0;
+ break;
+ }
+ }
+ noise = noise->next;
+ }
+ if (word_len == 0)
+ continue;
+
word_crc = (int)
CalcCRC16Bytes(word_len, word);