2 * Default wordbreaker module for full text indexing.
4 * Copyright (c) 2005-2017 by the citadel.org team
6 * This program is open source software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 3.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
23 #include <sys/types.h>
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
30 # include <sys/time.h>
40 #include <libcitadel.h>
43 #include "sysdep_decls.h"
44 #include "citserver.h"
50 #include "ft_wordbreaker.h"
52 #include "ctdl_module.h"
55 * Noise words are not included in search indices.
56 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
57 * must also be changed, so that the index is rebuilt.
59 static char *noise_words[] = {
/*
 * Number of entries in noise_words[]: total array size divided by the size of
 * one char * element.  This only works because noise_words is a true array in
 * this scope (not a pointer).  Used as the loop bound when wordbreaker()
 * compares a candidate word against every noise word.
 */
122 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
/*
 * Comparison callback for two int values; wordbreaker() hands it to qsort()
 * to put the token array in ascending order.
 *
 * rec1, rec2   pointers to the two ints being compared (never modified)
 *
 * Returns 1 when the first value is greater than the second and -1 when it is
 * smaller.
 * NOTE(review): the declarations of i1/i2 and the return for equal values are
 * not visible here; presumably the equal case returns 0 -- confirm.
 */
128 int intcmp(const void *rec1, const void *rec2) {
/* Load both operands through const-qualified int pointers. */
131 i1 = *(const int *)rec1;
132 i2 = *(const int *)rec2;
/* Explicit comparisons; no subtraction is involved, so nothing can overflow. */
134 if (i1 > i2) return(1);
135 if (i1 < i2) return(-1);
/*
 * Break a text string into tokens for the full text index.
 *
 * Words are runs of alphanumeric characters.  Each word whose length lies in
 * the range WB_MIN..WB_MAX is lowercased, checked against noise_words[], and
 * turned into an int token by CalcCRC16Bytes().  The collected tokens are then
 * sorted and adjacent duplicates are squeezed out.
 *
 * text         string to tokenize; NULL and empty strings are special-cased
 * num_tokens   receives the final token count
 * tokens       presumably receives the token array (storage obtained from
 *              realloc(), so the caller would own it) -- the assignment is
 *              not visible here, TODO confirm
 */
140 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
/* Growable token array: entries in use, entries allocated, and the storage. */
142 int wb_num_tokens = 0;
143 int wb_num_alloc = 0;
144 int *wb_tokens = NULL;
/* Boundaries of the word currently being scanned, as pointers into text. */
147 const char *word_start;
148 const char *word_end;
155 if (text == NULL) { /* no NULL text please */
161 if (text[0] == 0) { /* no empty text either */
/*
 * A word ends at the first non-alphanumeric character seen while a word is in
 * progress (word_start is non-NULL).
 * NOTE(review): if ch is a plain char, a negative value passed to isalnum()
 * is undefined behavior; the declaration of ch is not visible here -- confirm
 * and cast to unsigned char if so.
 */
178 if ( (!isalnum(ch)) && (word_start) ) {
181 /* extract the word */
/* Word length in bytes: distance between the two boundary pointers. */
182 word_len = word_end - word_start;
/*
 * Oversized word: log it, copy as much as the local buffer can hold and force
 * NUL termination in the last byte.
 */
183 if (word_len >= sizeof word) {
184 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
185 safestrncpy(word, word_start, sizeof word);
186 word[(sizeof word) - 1] = 0;
/*
 * Normal case: the word fits.  The size argument is word_len+1, presumably so
 * that safestrncpy() copies word_len characters plus a terminator -- confirm
 * against the libcitadel helper.
 */
189 safestrncpy(word, word_start, word_len+1);
194 /* are we ok with the length? */
195 if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
/*
 * Lowercase the word in place so that tokens do not depend on letter case.
 * NOTE(review): same <ctype.h> caveat as isalnum() above applies to tolower()
 * if word[] holds plain char values that can be negative.
 */
196 for (i=0; i<word_len; ++i) {
197 word[i] = tolower(word[i]);
199 /* disqualify noise words */
/* strcmp() returns 0 on an exact match with a noise word. */
200 for (i=0; i<NUM_NOISE; ++i) {
201 if (!strcmp(word, noise_words[i])) {
/* The token is the CalcCRC16Bytes() result for the lowercased word, as an int. */
210 word_crc = (int) CalcCRC16Bytes(word_len, word);
/*
 * Grow the token storage when the count in use exceeds the allocated capacity.
 * NOTE(review): the realloc() result is assigned straight back to wb_tokens
 * and no NULL check is visible; on allocation failure the old buffer would be
 * lost and the store below would write through a NULL pointer -- confirm.
 */
213 if (wb_num_tokens > wb_num_alloc) {
215 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
/* Store into the last slot in use (index is count minus one). */
217 wb_tokens[wb_num_tokens - 1] = word_crc;
222 /* sort and purge dups */
/* Sorting and deduplication are only attempted with two or more tokens. */
223 if (wb_num_tokens > 1) {
/* Ascending sort makes equal tokens adjacent. */
224 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
225 for (i=0; i<(wb_num_tokens-1); ++i) {
/*
 * Adjacent duplicate: slide the tail (elements i+1 through the last one) down
 * by one slot, overwriting element i.  memmove() is required because source
 * and destination overlap.
 * NOTE(review): one memmove per duplicate makes the purge quadratic in the
 * worst case.
 */
226 if (wb_tokens[i] == wb_tokens[i+1]) {
227 memmove(&wb_tokens[i], &wb_tokens[i+1],
228 ((wb_num_tokens - i - 1)*sizeof(int)));
/* Report the final token count to the caller. */
235 *num_tokens = wb_num_tokens;