4 * Default wordbreaker module for full text indexing.
17 #include <sys/types.h>
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
24 # include <sys/time.h>
36 #include "sysdep_decls.h"
37 #include "citserver.h"
40 #include "serv_extensions.h"
45 #include "ft_wordbreaker.h"
49 * Noise words are not included in search indices.
50 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
51 * must also be changed, so that the index is rebuilt.
53 static char *noise_words[] = {
/*
 * Comparator with the signature qsort() expects; wordbreaker() passes it to
 * qsort() to order its int token array.
 *
 * rec1, rec2  pointers to the two int values being compared.
 *
 * Returns 1 when *rec1 is greater than *rec2 and -1 when it is smaller, which
 * yields an ascending sort.  The return for equal values is not visible in
 * this excerpt.
 */
164 int intcmp(const void *rec1, const void *rec2) {
/* Read both operands as int through the const void pointers. */
167 i1 = *(const int *)rec1;
168 i2 = *(const int *)rec2;
/* Explicit comparisons rather than (i1 - i2), so no subtraction is involved. */
170 if (i1 > i2) return(1);
171 if (i1 < i2) return(-1);
/*
 * Break a block of text into words and build an array of int tokens, one per
 * word that survives the noise-word and length filters.
 *
 * text        input string; NULL and the empty string are special-cased
 *             before any scanning (the handling itself is not visible here).
 * num_tokens  out: receives the token count left after the sort/dedup pass.
 * tokens      out: presumably receives the token array -- the assignment is
 *             not visible in this excerpt; confirm, and confirm who frees it.
 */
176 void wordbreaker(char *text, int *num_tokens, int **tokens) {
/* Token array state: number of entries used, entries allocated, storage. */
178 int wb_num_tokens = 0;
179 int wb_num_alloc = 0;
180 int *wb_tokens = NULL;
191 if (text == NULL) { /* no NULL text please */
197 if (text[0] == 0) { /* no empty text either */
/*
 * A word ends at the first non-alphanumeric character seen while word_start
 * is set.
 * NOTE(review): if ch is a plain char, a negative value passed to isalnum()
 * is undefined behavior; a cast to unsigned char would avoid that -- confirm
 * the declared type of ch.
 */
214 if ( (!isalnum(ch)) && (word_start) ) {
218 /* extract the word */
219 word_len = word_end - word_start + 1;
/* Copy is bounded by the size of the word buffer, not by word_len. */
220 safestrncpy(word, word_start, sizeof word);
221 if (word_len >= sizeof word) {
222 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
/*
 * NOTE(review): `sizeof word_len` is the size of the length variable, not of
 * the word buffer, so this terminates the string at a small fixed offset
 * instead of at the end of the buffer.  `sizeof word` looks like the intent.
 */
223 word[(sizeof word_len) - 1] = 0;
230 /* disqualify noise words */
/* Case-insensitive comparison against every entry of noise_words[]. */
231 for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
232 if (!strcasecmp(word, noise_words[i])) {
238 /* are we ok with the length? */
239 if ( (word_len >= WB_MIN)
240 && (word_len <= WB_MAX) ) {
/*
 * Lowercase the word in place, then pass it to CalcCRC16Bytes().
 * NOTE(review): as with isalnum() above, tolower() needs an unsigned char
 * value if word[] holds plain char -- confirm.
 */
241 for (i=0; i<word_len; ++i) {
242 word[i] = tolower(word[i]);
245 CalcCRC16Bytes(word_len, word);
/*
 * Grow the token array once the used count exceeds the allocated count (the
 * step that raises wb_num_alloc is not visible in this excerpt).
 * NOTE(review): the realloc() result overwrites wb_tokens directly and is not
 * checked for NULL before the store that follows.
 */
248 if (wb_num_tokens > wb_num_alloc) {
250 wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
/* The word's CRC is the token; it goes into the last used slot. */
252 wb_tokens[wb_num_tokens - 1] = word_crc;
257 /* sort and purge dups */
258 if (wb_num_tokens > 1) {
/* Ascending sort makes equal tokens adjacent. */
259 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
260 for (i=0; i<(wb_num_tokens-1); ++i) {
261 if (wb_tokens[i] == wb_tokens[i+1]) {
/*
 * Slide the tail down one slot over the duplicate.  The matching adjustment
 * of the count and index is not visible in this excerpt.
 */
262 memmove(&wb_tokens[i], &wb_tokens[i+1],
263 ((wb_num_tokens - i - 1)*sizeof(int)));
/* Report the final token count to the caller. */
270 *num_tokens = wb_num_tokens;