1 // Default wordbreaker module for full text indexing.
3 // Copyright (c) 2005-2024 by the citadel.org team
4 // This program is open source software. Use, duplication, or disclosure is subject to the GNU General Public License v3.
6 #include "../../sysdep.h"
14 #include <sys/types.h>
20 #include <libcitadel.h>
21 #include "../../citadel_defs.h"
22 #include "../../server.h"
23 #include "../../sysdep_decls.h"
24 #include "../../citserver.h"
25 #include "../../support.h"
26 #include "../../config.h"
27 #include "../../database.h"
28 #include "../../msgbase.h"
29 #include "../../control.h"
30 #include "ft_wordbreaker.h"
32 #include "../../ctdl_module.h"
34 // Noise words are not included in search indices.
35 // NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID must also be changed, so that the index is rebuilt.
36 static char *noise_words[] = {
// Number of entries in noise_words[].  Dividing by the size of the array's own
// first element (rather than a spelled-out type) keeps the count correct even
// if the element type of the table is changed later.
#define NUM_NOISE (sizeof(noise_words) / sizeof(noise_words[0]))
// Three-way comparison of two ints, for use as a sort callback.
//
// rec1, rec2: pointers to the two int values being compared.
//
// Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller, and 0 if the
// two values are equal.  Explicit comparisons are used instead of subtraction
// so that widely separated values cannot overflow.
int intcmp(const void *rec1, const void *rec2) {
	const int i1 = *(const int *)rec1;
	const int i2 = *(const int *)rec2;

	if (i1 > i2) return(1);
	if (i1 < i2) return(-1);
	return(0);		// equal: every path must return a value
}
115 Array *wordbreaker(const char *text) {
117 const char *word_start;
118 const char *word_end;
125 if (text == NULL) { // no NULL text please
129 if (text[0] == 0) { // no empty text either
133 Array *found_tokens = array_new(sizeof(int));
134 if (found_tokens == NULL) {
149 if ( (!isalnum(ch)) && (word_start) ) {
153 word_len = word_end - word_start;
154 if (word_len >= sizeof word) {
155 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
156 safestrncpy(word, word_start, sizeof word);
157 word[(sizeof word) - 1] = 0;
160 safestrncpy(word, word_start, word_len+1);
165 // are we ok with the length?
166 if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
167 for (i=0; i<word_len; ++i) {
168 word[i] = tolower(word[i]);
170 // disqualify noise words
171 for (i=0; i<NUM_NOISE; ++i) {
172 if (!strcasecmp(word, noise_words[i])) {
177 // add it to the array (FIXME make this case insensitive)
179 word_crc = (int) CalcCRC16Bytes(word_len, word);
180 array_append(found_tokens, &word_crc);
186 // sort and purge dups
187 if (array_len(found_tokens) > 1) {
188 array_sort(found_tokens, intcmp);
189 for (i=0; i<(array_len(found_tokens)); ++i) {
190 if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
191 array_delete_element_at(found_tokens, i);
196 return(found_tokens);