/*
- * $Id$
- *
* Default wordbreaker module for full text indexing.
*
- * Copyright (c) 2005-2009 by the citadel.org team
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
+ * Copyright (c) 2005-2017 by the citadel.org team
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * This program is open source software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3.
*
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
*/
-
#include "sysdep.h"
#include <stdlib.h>
#include <unistd.h>
* NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
* must also be changed, so that the index is rebuilt.
*/
-
-noise_word *noise_words[26];
-
-static char *noise_words_init[] = {
+static char *noise_words[] = {
"about",
"after",
"also",
"would",
"your"
};
+/*
+ * Number of entries in noise_words[].  Computed from the array's own element
+ * size so it stays correct if the element type changes, and cast to int so it
+ * compares cleanly against the signed loop index used when scanning the list.
+ */
+#define NUM_NOISE ((int)(sizeof(noise_words) / sizeof(noise_words[0])))
-/*
- * Build the per-letter noise word lists (removed by this change).
- *
- * Zeroes the noise_words[] bucket array, then for each entry of
- * noise_words_init[] allocates a noise_word node holding the word's length
- * and a strdup()'d copy of the word, and pushes that node onto the head of
- * the bucket selected by the word's first character minus 'a'.
- *
- * NOTE(review): the malloc() and strdup() results are used unchecked, and the
- * bucket index only stays inside noise_words[26] if every word begins with
- * 'a'..'z' -- confirm against the full word list.
- */
-void initialize_noise_words(void)
-{
- int i;
- int len;
- int ch;
- noise_word *next;
-
- memset (noise_words, 0, sizeof(noise_words));
-
- for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
- {
- ch = noise_words_init[i][0] - 'a'; /* bucket index from the first letter */
- len = strlen(noise_words_init[i]);
-
- next = malloc(sizeof(noise_word));
- next->len = len;
- next->word = strdup(noise_words_init[i]);
- next->next = noise_words[ch]; /* link in front of the current bucket head */
- noise_words[ch] = next;
- }
-}
-
-
-/*
- * Free every node of the noise word lists (removed by this change).
- *
- * Logs an informational message, then walks each of the 26 noise_words[]
- * buckets, freeing each node's word string followed by the node itself.
- * The bucket heads in noise_words[] are not reset afterwards.
- */
-void noise_word_cleanup(void)
-{
- int i;
- noise_word *cur, *next;
-
- CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
-
- for (i = 0 ; i < 26 ; i++)
- {
- cur = noise_words[i];
- while (cur)
- {
- next = cur->next; /* save the link before cur is freed */
- free(cur->word);
- free(cur);
- cur = next;
- }
- }
-}
-
/*
* Compare function
*/
char word[256];
int i;
int word_crc;
- noise_word *noise;
-
if (text == NULL) { /* no NULL text please */
*num_tokens = 0;
ch = *ptr;
if ( (!isalnum(ch)) && (word_start) ) {
word_end = ptr;
-// --word_end;
/* extract the word */
word_len = word_end - word_start;
if (word_len >= sizeof word) {
- CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+ syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
safestrncpy(word, word_start, sizeof word);
word[(sizeof word) - 1] = 0;
}
word_start = NULL;
/* are we ok with the length? */
- if ( (word_len >= WB_MIN)
- && (word_len <= WB_MAX) ) {
+ if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
for (i=0; i<word_len; ++i) {
word[i] = tolower(word[i]);
}
/* disqualify noise words */
- noise = noise_words[(int) (word[0]-'a')];
- while (noise)
- {
- if (noise->len == word_len)
- {
- if (!strcmp(word, noise->word))
- {
- word_len = 0;
- break;
- }
+ for (i=0; i<NUM_NOISE; ++i) {
+ if (!strcmp(word, noise_words[i])) {
+ word_len = 0;
+ break;
}
- noise = noise->next;
}
+
if (word_len == 0)
continue;
- word_crc = (int)
- CalcCRC16Bytes(word_len, word);
+ word_crc = (int) CalcCRC16Bytes(word_len, word);
++wb_num_tokens;
if (wb_num_tokens > wb_num_alloc) {