2 * Default wordbreaker module for full text indexing.
4 * Copyright (c) 2005-2012 by the citadel.org team
6 * This program is open source software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 3.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
30 #include <sys/types.h>
32 #if TIME_WITH_SYS_TIME
33 # include <sys/time.h>
37 # include <sys/time.h>
47 #include <libcitadel.h>
50 #include "sysdep_decls.h"
51 #include "citserver.h"
57 #include "ft_wordbreaker.h"
59 #include "ctdl_module.h"
62 * Noise words are not included in search indices.
63 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
64 * must also be changed, so that the index is rebuilt.
/* Heads of the noise-word lists; a word is filed under (first character - 'a'). */
67 noise_word *noise_words[26];
69 static char *noise_words_init[] = {
134 void initialize_noise_words(void)
141 memset (noise_words, 0, sizeof(noise_words));
143 for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
145 ch = noise_words_init[i][0] - 'a';
146 len = strlen(noise_words_init[i]);
148 next = malloc(sizeof(noise_word));
150 next->word = strdup(noise_words_init[i]);
151 next->next = noise_words[ch];
152 noise_words[ch] = next;
/*
 * Log that the full-text noise words are being cleaned up, then visit each of
 * the 26 entries of noise_words[], starting from the list head stored there.
 * NOTE(review): the statements that follow the head lookup are not visible
 * here; presumably each list is walked through cur/next and its nodes are
 * released -- confirm against the complete file.
 */
157 void noise_word_cleanup(void)
160 noise_word *cur, *next;
162 syslog(LOG_INFO, "Cleaning up fulltext noise words.\n");
164 for (i = 0 ; i < 26 ; i++)
/* head of the list for bucket i */
166 cur = noise_words[i];
/*
 * Comparison callback for qsort() over an array of int.
 *
 * rec1, rec2: pointers to the two int elements being compared.
 * Returns 1 if *rec1 > *rec2, -1 if *rec1 < *rec2, and 0 if they are equal.
 *
 * The values are compared rather than subtracted, so the result is correct
 * even for operands whose difference would overflow an int.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int i1 = *(const int *)rec1;
	const int i2 = *(const int *)rec2;

	if (i1 > i2) return(1);
	if (i1 < i2) return(-1);
	return(0);	/* equal: qsort() needs a defined result here too */
}
192 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
194 int wb_num_tokens = 0;
195 int wb_num_alloc = 0;
196 int *wb_tokens = NULL;
199 const char *word_start;
200 const char *word_end;
209 if (text == NULL) { /* no NULL text please */
215 if (text[0] == 0) { /* no empty text either */
232 if ( (!isalnum(ch)) && (word_start) ) {
236 /* extract the word */
237 word_len = word_end - word_start;
238 if (word_len >= sizeof word) {
239 syslog(LOG_DEBUG, "Invalid word length: %d\n", word_len);
240 safestrncpy(word, word_start, sizeof word);
241 word[(sizeof word) - 1] = 0;
244 safestrncpy(word, word_start, word_len+1);
249 /* are we ok with the length? */
250 if ( (word_len >= WB_MIN)
251 && (word_len <= WB_MAX) ) {
252 for (i=0; i<word_len; ++i) {
253 word[i] = tolower(word[i]);
255 /* disqualify noise words */
256 noise = noise_words[(int) (word[0]-'a')];
259 if (noise->len == word_len)
261 if (!strcmp(word, noise->word))
273 CalcCRC16Bytes(word_len, word);
276 if (wb_num_tokens > wb_num_alloc) {
278 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
280 wb_tokens[wb_num_tokens - 1] = word_crc;
285 /* sort and purge dups */
286 if (wb_num_tokens > 1) {
287 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
288 for (i=0; i<(wb_num_tokens-1); ++i) {
289 if (wb_tokens[i] == wb_tokens[i+1]) {
290 memmove(&wb_tokens[i], &wb_tokens[i+1],
291 ((wb_num_tokens - i - 1)*sizeof(int)));
298 *num_tokens = wb_num_tokens;