a580879157c9c0c3078df363714ff1cf38fea32a
[citadel.git] / citadel / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include "citadel.h"
35 #include "server.h"
36 #include "sysdep_decls.h"
37 #include "citserver.h"
38 #include "support.h"
39 #include "config.h"
40 #include "serv_extensions.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "tools.h"
45 #include "ft_wordbreaker.h"
46 #include "crc16.h"
47
/*
 * Noise words are not included in search indices.  The wordbreaker compares
 * each extracted word against this list without regard to case.
 *
 * The table is read-only, so both the strings and the pointers are const.
 *
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 */
static const char *const noise_words[] = {
        "about",
        "after",
        "all",
        "also",
        "an",
        "and",
        "another",
        "any",
        "are",
        "as",
        "at",
        "be",
        "because",
        "been",
        "before",
        "being",
        "between",
        "both",
        "but",
        "by",
        "came",
        "can",
        "come",
        "could",
        "did",
        "do",
        "each",
        "for",
        "from",
        "get",
        "got",
        "had",
        "has",
        "have",
        "he",
        "her",
        "here",
        "him",
        "himself",
        "his",
        "how",
        "if",
        "in",
        "into",
        "is",
        "it",
        "like",
        "make",
        "many",
        "me",
        "might",
        "more",
        "most",
        "much",
        "must",
        "my",
        "never",
        "now",
        "of",
        "on",
        "only",
        "or",
        "other",
        "our",
        "out",
        "over",
        "said",
        "same",
        "see",
        "should",
        "since",
        "some",
        "still",
        "such",
        "take",
        "than",
        "that",
        "the",
        "their",
        "them",
        "then",
        "there",
        "these",
        "they",
        "this",
        "those",
        "through",
        "to",
        "too",
        "under",
        "up",
        "very",
        "was",
        "way",
        "we",
        "well",
        "were",
        "what",
        "where",
        "which",
        "while",
        "with",
        "would",
        "you",
        "your"
};
160
/*
 * Three-way comparison of two ints, in the form expected by qsort().
 *
 * rec1, rec2   pointers to the int values being compared
 *
 * Returns 1 if the first value is greater, -1 if it is smaller,
 * and 0 if the two values are equal.
 */
int intcmp(const void *rec1, const void *rec2) {
        const int *lhs = rec1;
        const int *rhs = rec2;

        /* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
        return (*lhs > *rhs) - (*lhs < *rhs);
}
174
175
176 void wordbreaker(char *text, int *num_tokens, int **tokens) {
177
178         int wb_num_tokens = 0;
179         int wb_num_alloc = 0;
180         int *wb_tokens = NULL;
181
182         char *ptr;
183         char *word_start;
184         char *word_end;
185         char ch;
186         int word_len;
187         char word[256];
188         int i;
189         int word_crc;
190
191         if (text == NULL) {             /* no NULL text please */
192                 *num_tokens = 0;
193                 *tokens = NULL;
194                 return;
195         }
196
197         if (text[0] == 0) {             /* no empty text either */
198                 *num_tokens = 0;
199                 *tokens = NULL;
200                 return;
201         }
202
203         ptr = text;
204         word_start = NULL;
205         while (*ptr) {
206                 ch = *ptr;
207                 if (isalnum(ch)) {
208                         if (!word_start) {
209                                 word_start = ptr;
210                         }
211                 }
212                 ++ptr;
213                 ch = *ptr;
214                 if ( (!isalnum(ch)) && (word_start) ) {
215                         word_end = ptr;
216                         --word_end;
217
218                         /* extract the word */
219                         word_len = word_end - word_start + 1;
220                         safestrncpy(word, word_start, sizeof word);
221                         if (word_len >= sizeof word) {
222                                 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
223                                 word[(sizeof word_len) - 1] = 0;
224                         }
225                         else {
226                                 word[word_len] = 0;
227                         }
228                         word_start = NULL;
229
230                         /* disqualify noise words */
231                         for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
232                                 if (!strcasecmp(word, noise_words[i])) {
233                                         word_len = 0;
234                                         break;
235                                 }
236                         }
237
238                         /* are we ok with the length? */
239                         if ( (word_len >= WB_MIN)
240                            && (word_len <= WB_MAX) ) {
241                                 for (i=0; i<word_len; ++i) {
242                                         word[i] = tolower(word[i]);
243                                 }
244                                 word_crc = (int)
245                                         CalcCRC16Bytes(word_len, word);
246
247                                 ++wb_num_tokens;
248                                 if (wb_num_tokens > wb_num_alloc) {
249                                         wb_num_alloc += 512;
250                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
251                                 }
252                                 wb_tokens[wb_num_tokens - 1] = word_crc;
253                         }
254                 }
255         }
256
257         /* sort and purge dups */
258         if (wb_num_tokens > 1) {
259                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
260                 for (i=0; i<(wb_num_tokens-1); ++i) {
261                         if (wb_tokens[i] == wb_tokens[i+1]) {
262                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
263                                         ((wb_num_tokens - i - 1)*sizeof(int)));
264                                 --wb_num_tokens;
265                                 --i;
266                         }
267                 }
268         }
269
270         *num_tokens = wb_num_tokens;
271         *tokens = wb_tokens;
272 }
273