X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=citadel%2Fmodules%2Ffulltext%2Fft_wordbreaker.c;h=b236de362643d94d3b61c605c7586c63bb4a8306;hb=7a9b0685e406cc83597171cc39d008c7e5459ca8;hp=44f40a45c58915a2b1748f696c0585b57b949f17;hpb=6106c1da54f0923550c7bdceb45246fb88e9ea19;p=citadel.git diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c index 44f40a45c..b236de362 100644 --- a/citadel/modules/fulltext/ft_wordbreaker.c +++ b/citadel/modules/fulltext/ft_wordbreaker.c @@ -1,24 +1,17 @@ /* * Default wordbreaker module for full text indexing. * - * Copyright (c) 2005-2009 by the citadel.org team + * Copyright (c) 2005-2017 by the citadel.org team * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. + * This program is open source software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. */ - #include "sysdep.h" #include #include @@ -28,18 +21,7 @@ #include #include #include - -#if TIME_WITH_SYS_TIME -# include -# include -#else -# if HAVE_SYS_TIME_H -# include -# else -# include -# endif -#endif - +#include #include #include #include @@ -63,10 +45,7 @@ * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID * must also be changed, so that the index is rebuilt. */ - -noise_word *noise_words[26]; - -static char *noise_words_init[] = { +static char *noise_words[] = { "about", "after", "also", @@ -129,51 +108,9 @@ static char *noise_words_init[] = { "would", "your" }; +#define NUM_NOISE (sizeof(noise_words) / sizeof(char *)) -void initialize_noise_words(void) -{ - int i; - int len; - int ch; - noise_word *next; - - memset (noise_words, 0, sizeof(noise_words)); - - for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i) - { - ch = noise_words_init[i][0] - 'a'; - len = strlen(noise_words_init[i]); - - next = malloc(sizeof(noise_word)); - next->len = len; - next->word = strdup(noise_words_init[i]); - next->next = noise_words[ch]; - noise_words[ch] = next; - } -} - - -void noise_word_cleanup(void) -{ - int i; - noise_word *cur, *next; - - CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n"); - - for (i = 0 ; i < 26 ; i++) - { - cur = noise_words[i]; - while (cur) - { - next = cur->next; - free(cur->word); - free(cur); - cur = next; - } - } -} - /* * Compare function */ @@ -203,8 +140,6 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) { char word[256]; int i; int word_crc; - noise_word *noise; - if (text == NULL) { /* no NULL text please */ *num_tokens = 0; @@ -231,12 +166,11 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) { ch = *ptr; if ( (!isalnum(ch)) && (word_start) ) { word_end = ptr; -// --word_end; /* extract the word */ word_len = word_end - word_start; if (word_len >= sizeof word) { - CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len); + syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len); safestrncpy(word, word_start, sizeof word); word[(sizeof word) - 1] = 0; } @@ -247,30 +181,22 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) { word_start = NULL; /* are we ok with the length? */ - if ( (word_len >= WB_MIN) - && (word_len <= WB_MAX) ) { + if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) { for (i=0; ilen == word_len) - { - if (!strcmp(word, noise->word)) - { - word_len = 0; - break; - } + for (i=0; inext; } + if (word_len == 0) continue; - word_crc = (int) - CalcCRC16Bytes(word_len, word); + word_crc = (int) CalcCRC16Bytes(word_len, word); ++wb_num_tokens; if (wb_num_tokens > wb_num_alloc) {