4 * Default wordbreaker module for full text indexing.
17 #include <sys/types.h>
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
24 # include <sys/time.h>
34 #include <libcitadel.h>
37 #include "sysdep_decls.h"
38 #include "citserver.h"
44 #include "ft_wordbreaker.h"
46 #include "ctdl_module.h"
49 * Noise words are not included in search indices.
50 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
51 * must also be changed, so that the index is rebuilt.
/*
 * Noise word lookup table: 26 list heads.  initialize_noise_words() selects
 * the bucket as (first character - 'a') and pushes each new node onto the
 * front of that bucket's singly linked list.
 */
54 noise_word *noise_words[26];
/*
 * Built-in list of noise words.  initialize_noise_words() copies every entry
 * of this array into the noise_words table.
 */
56 static char *noise_words_init[] = {
121 void initialize_noise_words(void)
128 memset (noise_words, 0, sizeof(noise_words));
130 for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
132 ch = noise_words_init[i][0] - 'a';
133 len = strlen(noise_words_init[i]);
135 next = malloc(sizeof(noise_word));
137 next->word = strdup(noise_words_init[i]);
138 next->next = noise_words[ch];
139 noise_words[ch] = next;
/*
 * noise_word_cleanup() - walk the noise word table at cleanup time.
 *
 * Takes no arguments and returns nothing.  Logs an informational message,
 * then visits each of the 26 noise_words buckets starting from its list head.
 * Presumably every node in each list is released here -- TODO confirm.
 */
144 void noise_word_cleanup(void)
/* cur: node being visited; next: presumably its successor, saved before cur is released -- TODO confirm. */
147 noise_word *cur, *next;
149 CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
/* One pass per bucket of the 26-entry table. */
151 for (i = 0 ; i < 26 ; i++)
/* Begin at the head of this bucket's list. */
153 cur = noise_words[i];
/*
 * intcmp() - qsort() comparison callback for arrays of int.
 *
 * rec1, rec2  pointers to the two int elements being compared
 *
 * Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller, and 0 if
 * the two values are equal.  The values are compared rather than subtracted,
 * so the result is correct over the whole range of int.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int i1 = *(const int *)rec1;
	const int i2 = *(const int *)rec2;

	if (i1 > i2) return(1);
	if (i1 < i2) return(-1);
	return(0);
}
/*
 * wordbreaker() - break text into words and reduce them to a list of
 * integer tokens, which is sorted and purged of duplicates at the end.
 *
 * text        string to tokenize; NULL and empty strings are special-cased
 * num_tokens  receives the final token count
 * tokens      presumably receives the token array built in wb_tokens --
 *             TODO confirm
 */
179 void wordbreaker(char *text, int *num_tokens, int **tokens) {
/* Token list state: entries in use, entries allocated, and the array itself. */
181 int wb_num_tokens = 0;
182 int wb_num_alloc = 0;
183 int *wb_tokens = NULL;
196 if (text == NULL) { /* no NULL text please */
202 if (text[0] == 0) { /* no empty text either */
/*
 * A word is complete once a word start has been recorded and the current
 * character is not alphanumeric.
 * NOTE(review): if ch is a plain char, negative values passed to isalnum()
 * are undefined behaviour -- confirm the type of ch.
 */
219 if ( (!isalnum(ch)) && (word_start) ) {
223 /* extract the word */
224 word_len = word_end - word_start;
/*
 * Words that do not fit in the word buffer are logged and copied truncated,
 * with the last byte forced to NUL.  word_len itself is not reduced here, so
 * the WB_MAX test below still sees the full length.
 */
225 if (word_len >= sizeof word) {
226 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
227 safestrncpy(word, word_start, sizeof word);
228 word[(sizeof word) - 1] = 0;
/* Normal case: the size argument is word_len + 1, leaving room for the terminator. */
231 safestrncpy(word, word_start, word_len+1);
236 /* are we ok with the length? */
237 if ( (word_len >= WB_MIN)
238 && (word_len <= WB_MAX) ) {
/* Fold the word to lower case in place before lookup and hashing. */
239 for (i=0; i<word_len; ++i) {
240 word[i] = tolower(word[i]);
242 /* disqualify noise words */
/*
 * Pick the noise word bucket from the first character.
 * NOTE(review): isalnum() also accepts digits, so word[0]-'a' can fall
 * outside 0..25 and index past the 26-entry noise_words table; no range
 * check is visible here -- confirm and guard.
 */
243 noise = noise_words[(int) (word[0]-'a')];
/* Compare lengths first, and only then the strings. */
246 if (noise->len == word_len)
248 if (!strcmp(word, noise->word))
/* CRC16 over the word's bytes; presumably the value stored in word_crc -- TODO confirm. */
260 CalcCRC16Bytes(word_len, word);
/*
 * Grow the token array when the count exceeds the allocation.
 * NOTE(review): the realloc() result is assigned straight to wb_tokens and
 * then written through; on failure the old buffer is lost and the store
 * below dereferences NULL.
 */
263 if (wb_num_tokens > wb_num_alloc) {
265 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
/* Store the new token in the last slot in use. */
267 wb_tokens[wb_num_tokens - 1] = word_crc;
272 /* sort and purge dups */
273 if (wb_num_tokens > 1) {
274 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
/* After sorting, equal tokens are adjacent; shift the tail down over a duplicate. */
275 for (i=0; i<(wb_num_tokens-1); ++i) {
276 if (wb_tokens[i] == wb_tokens[i+1]) {
277 memmove(&wb_tokens[i], &wb_tokens[i+1],
278 ((wb_num_tokens - i - 1)*sizeof(int)));
/* Report the final token count to the caller. */
285 *num_tokens = wb_num_tokens;