664a7de8dc99c4b3794b8e051155d72209b23d2a
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include <libcitadel.h>
35 #include "citadel.h"
36 #include "server.h"
37 #include "sysdep_decls.h"
38 #include "citserver.h"
39 #include "support.h"
40 #include "config.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "ft_wordbreaker.h"
45 #include "crc16.h"
46 #include "ctdl_module.h"
47
48 /*
49  * Noise words are not included in search indices.
50  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
51  * must also be changed, so that the index is rebuilt.
52  */
53
54 noise_word *noise_words[26];
55
56 static char *noise_words_init[] = {
57         "about",
58         "after",
59         "also",
60         "another",
61         "because",
62         "been",
63         "before",
64         "being",
65         "between",
66         "both",
67         "came",
68         "come",
69         "could",
70         "each",
71         "from",
72         "have",
73         "here",
74         "himself",
75         "into",
76         "like",
77         "make",
78         "many",
79         "might",
80         "more",
81         "most",
82         "much",
83         "must",
84         "never",
85         "only",
86         "other",
87         "over",
88         "said",
89         "same",
90         "should",
91         "since",
92         "some",
93         "still",
94         "such",
95         "take",
96         "than",
97         "that",
98         "their",
99         "them",
100         "then",
101         "there",
102         "these",
103         "they",
104         "this",
105         "those",
106         "through",
107         "under",
108         "very",
109         "well",
110         "were",
111         "what",
112         "where",
113         "which",
114         "while",
115         "with",
116         "would",
117         "your"
118 };
119
120
121 void initialize_noise_words(void)
122 {
123         int i;
124         int len;
125         int ch;
126         noise_word *next;
127         
128         memset (noise_words, 0, sizeof(noise_words));
129         
130         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
131         {
132                 ch = noise_words_init[i][0] - 'a';
133                 len = strlen(noise_words_init[i]);
134                 
135                 next = malloc(sizeof(noise_word));
136                 next->len = len;
137                 next->word = strdup(noise_words_init[i]);
138                 next->next = noise_words[ch];
139                 noise_words[ch] = next;
140         }
141 }
142
143
144 void noise_word_cleanup(void)
145 {
146         int i;
147         noise_word *cur, *next;
148         
149         CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
150         
151         for (i = 0 ; i < 26 ; i++)
152         {
153                 cur = noise_words[i];
154                 while (cur)
155                 {
156                         next = cur->next;
157                         free(cur->word);
158                         free(cur);
159                         cur = next;
160                 }
161         }
162 }
163
164 /*
165  * Compare function
166  */
167 int intcmp(const void *rec1, const void *rec2) {
168         int i1, i2;
169
170         i1 = *(const int *)rec1;
171         i2 = *(const int *)rec2;
172
173         if (i1 > i2) return(1);
174         if (i1 < i2) return(-1);
175         return(0);
176 }
177
178
179 void wordbreaker(char *text, int *num_tokens, int **tokens) {
180
181         int wb_num_tokens = 0;
182         int wb_num_alloc = 0;
183         int *wb_tokens = NULL;
184
185         char *ptr;
186         char *word_start;
187         char *word_end;
188         char ch;
189         int word_len;
190         char word[256];
191         int i;
192         int word_crc;
193         noise_word *noise;
194         
195         
196         if (text == NULL) {             /* no NULL text please */
197                 *num_tokens = 0;
198                 *tokens = NULL;
199                 return;
200         }
201
202         if (text[0] == 0) {             /* no empty text either */
203                 *num_tokens = 0;
204                 *tokens = NULL;
205                 return;
206         }
207
208         ptr = text;
209         word_start = NULL;
210         while (*ptr) {
211                 ch = *ptr;
212                 if (isalnum(ch)) {
213                         if (!word_start) {
214                                 word_start = ptr;
215                         }
216                 }
217                 ++ptr;
218                 ch = *ptr;
219                 if ( (!isalnum(ch)) && (word_start) ) {
220                         word_end = ptr;
221 //                      --word_end;
222
223                         /* extract the word */
224                         word_len = word_end - word_start;
225                         if (word_len >= sizeof word) {
226                                 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
227                                 safestrncpy(word, word_start, sizeof word);
228                                 word[(sizeof word) - 1] = 0;
229                         }
230                         else {
231                                 safestrncpy(word, word_start, word_len+1);
232                                 word[word_len] = 0;
233                         }
234                         word_start = NULL;
235
236                         /* are we ok with the length? */
237                         if ( (word_len >= WB_MIN)
238                            && (word_len <= WB_MAX) ) {
239                                 for (i=0; i<word_len; ++i) {
240                                         word[i] = tolower(word[i]);
241                                 }
242                                 /* disqualify noise words */
243                                 noise = noise_words[(int) (word[0]-'a')];
244                                 while (noise)
245                                 {
246                                         if (noise->len == word_len)
247                                         {
248                                                 if (!strcmp(word, noise->word)) 
249                                                 {
250                                                         word_len = 0;
251                                                         break;
252                                                 }
253                                         }
254                                         noise = noise->next;
255                                 }
256                                 if (word_len == 0)
257                                         continue;
258
259                                 word_crc = (int)
260                                         CalcCRC16Bytes(word_len, word);
261
262                                 ++wb_num_tokens;
263                                 if (wb_num_tokens > wb_num_alloc) {
264                                         wb_num_alloc += 512;
265                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
266                                 }
267                                 wb_tokens[wb_num_tokens - 1] = word_crc;
268                         }
269                 }
270         }
271
272         /* sort and purge dups */
273         if (wb_num_tokens > 1) {
274                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
275                 for (i=0; i<(wb_num_tokens-1); ++i) {
276                         if (wb_tokens[i] == wb_tokens[i+1]) {
277                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
278                                         ((wb_num_tokens - i - 1)*sizeof(int)));
279                                 --wb_num_tokens;
280                                 --i;
281                         }
282                 }
283         }
284
285         *num_tokens = wb_num_tokens;
286         *tokens = wb_tokens;
287 }
288