2 * Default wordbreaker module for full text indexing.
4 * Copyright (c) 2005-2017 by the citadel.org team
6 * This program is open source software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 3.
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
15 #include "../../sysdep.h"
23 #include <sys/types.h>
29 #include <libcitadel.h>
30 #include "../../citadel_defs.h"
31 #include "../../server.h"
32 #include "../../sysdep_decls.h"
33 #include "../../citserver.h"
34 #include "../../support.h"
35 #include "../../config.h"
36 #include "../../database.h"
37 #include "../../msgbase.h"
38 #include "../../control.h"
39 #include "ft_wordbreaker.h"
41 #include "../../ctdl_module.h"
44 * Noise words are not included in search indices.
45 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
46 * must also be changed, so that the index is rebuilt.
48 static char *noise_words[] = {
/*
 * NUM_NOISE: number of entries in noise_words[], computed as the total size
 * of the array divided by the size of one char * element.  It is used as
 * the upper bound of the noise-word scan in wordbreaker().
 * The expression has type size_t.
 */
111 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
/*
 * intcmp() - ordering callback for arrays of int.
 *
 * rec1, rec2: pointers to the two elements being compared; each is read
 *             as a const int.
 *
 * Returns 1 when the first value is greater than the second and -1 when it
 * is smaller.  The result for equal values is not visible in this excerpt
 * (presumably 0 -- confirm).
 *
 * Passed to array_sort() by wordbreaker() to order the token array.
 */
117 int intcmp(const void *rec1, const void *rec2) {
/* Read both records through const int pointers before comparing. */
120 i1 = *(const int *)rec1;
121 i2 = *(const int *)rec2;
/* Explicit comparisons rather than subtraction, so no overflow is possible. */
123 if (i1 > i2) return(1);
124 if (i1 < i2) return(-1);
/*
 * wordbreaker() - turn a block of text into an array of word tokens.
 *
 * text: NUL-terminated input.  NULL and empty strings are tested for
 *       explicitly; what is returned in those cases is not visible in this
 *       excerpt.
 *
 * Each run of alphanumeric characters is copied into a local buffer,
 * lowercased, checked against noise_words[], and, when its length lies in
 * [WB_MIN, WB_MAX], reduced to a CRC16 that is stored as an int in an
 * Array created with array_new(sizeof(int)).  The array is then sorted with
 * intcmp() and passed through a duplicate-removal loop before being returned.
 *
 * Returns the token Array.  Ownership and release of the Array are not shown
 * here -- presumably the caller frees it; confirm against the callers.
 */
129 Array *wordbreaker(const char *text) {
/* Boundaries of the word currently being scanned, as pointers into text. */
131 const char *word_start;
132 const char *word_end;
139 if (text == NULL) { /* no NULL text please */
143 if (text[0] == 0) { /* no empty text either */
/* Tokens are stored as int-sized elements. */
147 Array *found_tokens = array_new(sizeof(int));
148 if (found_tokens == NULL) {
/*
 * A word ends when a non-alphanumeric character is seen while a word is open.
 * NOTE(review): the declaration of ch is not visible; if it is plain char,
 * negative values passed to isalnum() are undefined behaviour -- confirm and
 * cast to unsigned char if so.
 */
163 if ( (!isalnum(ch)) && (word_start) ) {
166 /* extract the word */
167 word_len = word_end - word_start;
/*
 * Words that would not fit in the local buffer are logged and truncated.
 * NOTE(review): the "%d" format implies word_len is an int, which makes this
 * a signed/unsigned comparison against sizeof -- confirm the declaration.
 */
168 if (word_len >= sizeof word) {
169 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
170 safestrncpy(word, word_start, sizeof word);
/*
 * Force termination of the truncated copy.
 * NOTE(review): word_len is not clamped on the lines visible here, so it may
 * still hold the untruncated length; the WB_MAX check below is then what keeps
 * later loops inside the buffer -- verify WB_MAX < sizeof word.
 */
171 word[(sizeof word) - 1] = 0;
/* Normal case: copy exactly word_len characters plus the terminator. */
174 safestrncpy(word, word_start, word_len+1);
179 /* are we ok with the length? */
180 if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
/* Lowercase in place so that the CRC does not depend on the input's case. */
181 for (i=0; i<word_len; ++i) {
/* NOTE(review): same ctype caveat as isalnum() above if word[] is plain char. */
182 word[i] = tolower(word[i]);
184 /* disqualify noise words */
185 for (i=0; i<NUM_NOISE; ++i) {
186 if (!strcasecmp(word, noise_words[i])) {
191 /* FIXME make this case insensitive */
192 /* add it to the array */
/* The token is the CRC16 of the lowercased word, widened to int. */
194 word_crc = (int) CalcCRC16Bytes(word_len, word);
195 array_append(found_tokens, &word_crc);
201 /* sort and purge dups */
202 if (array_len(found_tokens) > 1) {
203 array_sort(found_tokens, intcmp);
204 for (i=0; i<(array_len(found_tokens)); ++i) {
/*
 * NOTE(review): this compares the values returned by array_get_element_at()
 * directly.  If that function returns a pointer to the element (its prototype
 * is not visible here), the test compares addresses rather than the stored
 * ints and can never be true, so duplicates would never be removed.  On the
 * last iteration i+1 also indexes one past the final element.  Verify against
 * the libcitadel array API.
 */
205 if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
/*
 * NOTE(review): after deleting element i the following elements shift down;
 * unless i is adjusted on a line not visible here, the loop increment skips
 * the element that moved into slot i.
 */
206 array_delete_element_at(found_tokens, i);
211 return(found_tokens);