4 * Default wordbreaker module for full text indexing.
6 * Copyright (c) 2005-2009 by the citadel.org team
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 3 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32 #include <sys/types.h>
34 #if TIME_WITH_SYS_TIME
35 # include <sys/time.h>
39 # include <sys/time.h>
49 #include <libcitadel.h>
52 #include "sysdep_decls.h"
53 #include "citserver.h"
59 #include "ft_wordbreaker.h"
61 #include "ctdl_module.h"
64 * Noise words are not included in search indices.
65 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
66 * must also be changed, so that the index is rebuilt.
/*
 * Table of noise-word lists, one slot per letter: both
 * initialize_noise_words() and wordbreaker() select a slot with
 * (first character of the word) - 'a'.
 */
69 noise_word *noise_words[26];
/*
 * Strings that initialize_noise_words() copies into noise_words[].
 */
71 static char *noise_words_init[] = {
/*
 * Build the noise_words[] table from noise_words_init[].
 *
 * The table is zeroed first.  Then, for every string in noise_words_init[],
 * a noise_word node is allocated, given a strdup()'d copy of the string, and
 * pushed onto the front of the list selected by the string's first character.
 *
 * NOTE(review): the results of malloc() and strdup() are used without a
 * visible NULL check.
 */
136 void initialize_noise_words(void)
143 memset (noise_words, 0, sizeof(noise_words));
145 for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
/* Slot index; assumes every init string starts with 'a'..'z' so that the
 * result lands in 0..25 -- TODO confirm against the word list. */
147 ch = noise_words_init[i][0] - 'a';
/* wordbreaker() compares a node's len field against the candidate word's
 * length; presumably this value is what gets stored there -- verify. */
148 len = strlen(noise_words_init[i]);
150 next = malloc(sizeof(noise_word));
152 next->word = strdup(noise_words_init[i]);
/* Push onto the front of the list for this letter. */
153 next->next = noise_words[ch];
154 noise_words[ch] = next;
/*
 * Log an informational message, then visit each of the 26 noise_words[]
 * slots, starting from the head node of each list.
 *
 * NOTE(review): presumably each node (and its strdup()'d word) is released
 * while walking the list -- confirm that both are freed and that the slot is
 * left NULL afterwards.
 */
159 void noise_word_cleanup(void)
162 noise_word *cur, *next;
164 CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
166 for (i = 0 ; i < 26 ; i++)
/* Head of the list for letter 'a' + i. */
168 cur = noise_words[i];
/*
 * Comparison callback for qsort() over an array of int (wordbreaker() passes
 * it when sorting the token array).
 *
 * rec1, rec2: pointers to the two int values being compared.
 * Returns 1 when *rec1 is greater than *rec2 and -1 when it is smaller.
 * The values are compared directly rather than subtracted.
 */
182 int intcmp(const void *rec1, const void *rec2) {
185 i1 = *(const int *)rec1;
186 i2 = *(const int *)rec2;
188 if (i1 > i2) return(1);
189 if (i1 < i2) return(-1);
/*
 * Break a text into index tokens.
 *
 * text:       input string; NULL and empty strings are special-cased up
 *             front.
 * num_tokens: receives the number of tokens produced.
 * tokens:     output for the token array (presumably receives wb_tokens,
 *             an array grown with realloc() -- confirm who frees it).
 *
 * A word is a run of characters accepted by isalnum().  Words whose length
 * is within WB_MIN..WB_MAX are lowercased, dropped if they match an entry
 * in noise_words[], and otherwise hashed with CalcCRC16Bytes().  The
 * collected values are sorted with qsort()/intcmp and adjacent duplicates
 * are squeezed out.
 */
194 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
196 int wb_num_tokens = 0;
197 int wb_num_alloc = 0;
198 int *wb_tokens = NULL;
201 const char *word_start;
202 const char *word_end;
211 if (text == NULL) { /* no NULL text please */
217 if (text[0] == 0) { /* no empty text either */
/* A word ends at the first non-alphanumeric character seen while a word is
 * open (word_start is set).
 * NOTE(review): if ch is a plain char, bytes >= 0x80 may reach isalnum() as
 * negative values; <ctype.h> expects an unsigned char value or EOF. */
234 if ( (!isalnum(ch)) && (word_start) ) {
238 /* extract the word */
239 word_len = word_end - word_start;
/* Word does not fit the local buffer: log it and keep only what fits.
 * NOTE(review): word_len is not visibly clamped here; this is only safe if
 * the WB_MAX check below rejects such lengths -- confirm WB_MAX < sizeof word. */
240 if (word_len >= sizeof word) {
241 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
242 safestrncpy(word, word_start, sizeof word);
243 word[(sizeof word) - 1] = 0;
246 safestrncpy(word, word_start, word_len+1);
251 /* are we ok with the length? */
252 if ( (word_len >= WB_MIN)
253 && (word_len <= WB_MAX) ) {
254 for (i=0; i<word_len; ++i) {
/* NOTE(review): same <ctype.h> caveat as above if word[] holds plain char. */
255 word[i] = tolower(word[i]);
257 /* disqualify noise words */
/* NOTE(review): words are delimited by isalnum(), so word[0] can be a digit
 * (or any other character isalnum() accepts outside 'a'..'z'); the index
 * word[0]-'a' then falls outside the 26-slot table.  Needs a range check. */
258 noise = noise_words[(int) (word[0]-'a')];
/* Comparing lengths first avoids a strcmp() for most candidates. */
261 if (noise->len == word_len)
263 if (!strcmp(word, noise->word))
/* Hash of the lowercased word; word_crc is the value stored as the token. */
275 CalcCRC16Bytes(word_len, word);
/* Grow the token array when the count exceeds the allocated capacity.
 * NOTE(review): the realloc() result overwrites wb_tokens directly and is
 * then written through; on failure the old buffer is lost and the store
 * below dereferences NULL. */
278 if (wb_num_tokens > wb_num_alloc) {
280 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
282 wb_tokens[wb_num_tokens - 1] = word_crc;
287 /* sort and purge dups */
288 if (wb_num_tokens > 1) {
289 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
290 for (i=0; i<(wb_num_tokens-1); ++i) {
/* After sorting, equal tokens are adjacent; shift the tail down one slot
 * over the duplicate. */
291 if (wb_tokens[i] == wb_tokens[i+1]) {
292 memmove(&wb_tokens[i], &wb_tokens[i+1],
293 ((wb_num_tokens - i - 1)*sizeof(int)));
300 *num_tokens = wb_num_tokens;