Moved to new module init structure.
[citadel.git] / citadel / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include "citadel.h"
35 #include "server.h"
36 #include "sysdep_decls.h"
37 #include "citserver.h"
38 #include "support.h"
39 #include "config.h"
40 #include "database.h"
41 #include "msgbase.h"
42 #include "control.h"
43 #include "tools.h"
44 #include "ft_wordbreaker.h"
45 #include "crc16.h"
46
/*
 * Noise words are not included in search indices.  The table is read-only
 * and is kept in alphabetical order, grouped here by initial letter.
 *
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 */
static const char *const noise_words[] = {
	"about", "after", "all", "also", "an", "and", "another", "any",
	"are", "as", "at",
	"be", "because", "been", "before", "being", "between", "both",
	"but", "by",
	"came", "can", "come", "could",
	"did", "do",
	"each",
	"for", "from",
	"get", "got",
	"had", "has", "have", "he", "her", "here", "him", "himself",
	"his", "how",
	"if", "in", "into", "is", "it",
	"like",
	"make", "many", "me", "might", "more", "most", "much", "must", "my",
	"never", "now",
	"of", "on", "only", "or", "other", "our", "out", "over",
	"said", "same", "see", "should", "since", "some", "still", "such",
	"take", "than", "that", "the", "their", "them", "then", "there",
	"these", "they", "this", "those", "through", "to", "too",
	"under", "up",
	"very",
	"was", "way", "we", "well", "were", "what", "where", "which",
	"while", "with", "would",
	"you", "your"
};
159
/*
 * qsort()-style comparison callback for arrays of int.
 *
 * rec1, rec2	pointers to the two int values being compared
 *
 * Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller,
 * and 0 if the two values are equal.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int lhs = *(const int *)rec1;
	const int rhs = *(const int *)rec2;

	/* Each relational test yields 0 or 1, so the difference is -1, 0 or 1. */
	return((lhs > rhs) - (lhs < rhs));
}
173
174
175 void wordbreaker(char *text, int *num_tokens, int **tokens) {
176
177         int wb_num_tokens = 0;
178         int wb_num_alloc = 0;
179         int *wb_tokens = NULL;
180
181         char *ptr;
182         char *word_start;
183         char *word_end;
184         char ch;
185         int word_len;
186         char word[256];
187         int i;
188         int word_crc;
189
190         if (text == NULL) {             /* no NULL text please */
191                 *num_tokens = 0;
192                 *tokens = NULL;
193                 return;
194         }
195
196         if (text[0] == 0) {             /* no empty text either */
197                 *num_tokens = 0;
198                 *tokens = NULL;
199                 return;
200         }
201
202         ptr = text;
203         word_start = NULL;
204         while (*ptr) {
205                 ch = *ptr;
206                 if (isalnum(ch)) {
207                         if (!word_start) {
208                                 word_start = ptr;
209                         }
210                 }
211                 ++ptr;
212                 ch = *ptr;
213                 if ( (!isalnum(ch)) && (word_start) ) {
214                         word_end = ptr;
215                         --word_end;
216
217                         /* extract the word */
218                         word_len = word_end - word_start + 1;
219                         safestrncpy(word, word_start, sizeof word);
220                         if (word_len >= sizeof word) {
221                                 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
222                                 word[(sizeof word_len) - 1] = 0;
223                         }
224                         else {
225                                 word[word_len] = 0;
226                         }
227                         word_start = NULL;
228
229                         /* disqualify noise words */
230                         for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
231                                 if (!strcasecmp(word, noise_words[i])) {
232                                         word_len = 0;
233                                         break;
234                                 }
235                         }
236
237                         /* are we ok with the length? */
238                         if ( (word_len >= WB_MIN)
239                            && (word_len <= WB_MAX) ) {
240                                 for (i=0; i<word_len; ++i) {
241                                         word[i] = tolower(word[i]);
242                                 }
243                                 word_crc = (int)
244                                         CalcCRC16Bytes(word_len, word);
245
246                                 ++wb_num_tokens;
247                                 if (wb_num_tokens > wb_num_alloc) {
248                                         wb_num_alloc += 512;
249                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
250                                 }
251                                 wb_tokens[wb_num_tokens - 1] = word_crc;
252                         }
253                 }
254         }
255
256         /* sort and purge dups */
257         if (wb_num_tokens > 1) {
258                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
259                 for (i=0; i<(wb_num_tokens-1); ++i) {
260                         if (wb_tokens[i] == wb_tokens[i+1]) {
261                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
262                                         ((wb_num_tokens - i - 1)*sizeof(int)));
263                                 --wb_num_tokens;
264                                 --i;
265                         }
266                 }
267         }
268
269         *num_tokens = wb_num_tokens;
270         *tokens = wb_tokens;
271 }
272