]> code.citadel.org Git - citadel.git/blob - citadel/ft_wordbreaker.c
* ft_wordbreaker.c: added a list of 'noise words' to ignore. This is
[citadel.git] / citadel / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include "citadel.h"
35 #include "server.h"
36 #include "sysdep_decls.h"
37 #include "citserver.h"
38 #include "support.h"
39 #include "config.h"
40 #include "serv_extensions.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "tools.h"
45 #include "ft_wordbreaker.h"
46 #include "crc16.h"
47
/*
 * Noise words: common words that are deliberately left out of the
 * search indices.  Matching against this table is case-insensitive.
 *
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 *
 * The table is read-only: both the pointers and the string literals they
 * refer to are const-qualified.
 */
static const char *const noise_words[] = {
        "about",
        "after",
        "all",
        "also",
        "an",
        "and",
        "another",
        "any",
        "are",
        "as",
        "at",
        "be",
        "because",
        "been",
        "before",
        "being",
        "between",
        "both",
        "but",
        "by",
        "came",
        "can",
        "come",
        "could",
        "did",
        "do",
        "each",
        "for",
        "from",
        "get",
        "got",
        "had",
        "has",
        "have",
        "he",
        "her",
        "here",
        "him",
        "himself",
        "his",
        "how",
        "if",
        "in",
        "into",
        "is",
        "it",
        "like",
        "make",
        "many",
        "me",
        "might",
        "more",
        "most",
        "much",
        "must",
        "my",
        "never",
        "now",
        "of",
        "on",
        "only",
        "or",
        "other",
        "our",
        "out",
        "over",
        "said",
        "same",
        "see",
        "should",
        "since",
        "some",
        "still",
        "such",
        "take",
        "than",
        "that",
        "the",
        "their",
        "them",
        "then",
        "there",
        "these",
        "they",
        "this",
        "those",
        "through",
        "to",
        "too",
        "under",
        "up",
        "very",
        "was",
        "way",
        "we",
        "well",
        "were",
        "what",
        "where",
        "which",
        "while",
        "with",
        "would",
        "you",
        "your"
};
160
/*
 * qsort()-style comparator for two int records.
 *
 * Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller,
 * and 0 if both hold the same value.
 */
int intcmp(const void *rec1, const void *rec2) {
        const int lhs = *(const int *)rec1;
        const int rhs = *(const int *)rec2;

        /* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
        return((lhs > rhs) - (lhs < rhs));
}
174
175
176 void wordbreaker(char *text, int *num_tokens, int **tokens) {
177
178         int wb_num_tokens = 0;
179         int wb_num_alloc = 0;
180         int *wb_tokens = NULL;
181
182         char *ptr;
183         char *word_start;
184         char *word_end;
185         char ch;
186         int word_len;
187         char word[256];
188         int i;
189         int word_crc;
190
191         if (text == NULL) {             /* no NULL text please */
192                 *num_tokens = 0;
193                 *tokens = NULL;
194                 return;
195         }
196
197         if (text[0] == 0) {             /* no empty text either */
198                 *num_tokens = 0;
199                 *tokens = NULL;
200                 return;
201         }
202
203         ptr = text;
204         word_start = NULL;
205         while (*ptr) {
206                 ch = *ptr;
207                 if (isalnum(ch)) {
208                         if (!word_start) {
209                                 word_start = ptr;
210                         }
211                 }
212                 ++ptr;
213                 ch = *ptr;
214                 if ( (!isalnum(ch)) && (word_start) ) {
215                         word_end = ptr;
216                         --word_end;
217
218                         /* extract the word */
219                         word_len = word_end - word_start + 1;
220                         safestrncpy(word, word_start, sizeof word);
221                         word[word_len] = 0;
222                         word_start = NULL;
223
224                         /* disqualify noise words */
225                         for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
226                                 if (!strcasecmp(word, noise_words[i])) {
227                                         word_len = 0;
228                                         break;
229                                 }
230                         }
231
232                         /* are we ok with the length? */
233                         if ( (word_len >= WB_MIN)
234                            && (word_len <= WB_MAX) ) {
235                                 for (i=0; i<word_len; ++i) {
236                                         word[i] = tolower(word[i]);
237                                 }
238                                 word_crc = (int)
239                                         CalcCRC16Bytes(word_len, word);
240
241                                 ++wb_num_tokens;
242                                 if (wb_num_tokens > wb_num_alloc) {
243                                         wb_num_alloc += 512;
244                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
245                                 }
246                                 wb_tokens[wb_num_tokens - 1] = word_crc;
247                         }
248                 }
249         }
250
251         /* sort and purge dups */
252         if (wb_num_tokens > 1) {
253                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
254                 for (i=0; i<(wb_num_tokens-1); ++i) {
255                         if (wb_tokens[i] == wb_tokens[i+1]) {
256                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
257                                         ((wb_num_tokens - i - 1)*sizeof(int)));
258                                 --wb_num_tokens;
259                                 --i;
260                         }
261                 }
262         }
263
264         *num_tokens = wb_num_tokens;
265         *tokens = wb_tokens;
266 }
267