* More license declarations
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  * Copyright (c) 2005-2009 by the citadel.org team
7  *
8  *  This program is free software; you can redistribute it and/or modify
9  *  it under the terms of the GNU General Public License as published by
10  *  the Free Software Foundation; either version 3 of the License, or
11  *  (at your option) any later version.
12  *
13  *  This program is distributed in the hope that it will be useful,
14  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *  GNU General Public License for more details.
17  *
18  *  You should have received a copy of the GNU General Public License
19  *  along with this program; if not, write to the Free Software
20  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
21  */
22
23
24 #include "sysdep.h"
25 #include <stdlib.h>
26 #include <unistd.h>
27 #include <stdio.h>
28 #include <fcntl.h>
29 #include <signal.h>
30 #include <pwd.h>
31 #include <errno.h>
32 #include <sys/types.h>
33
34 #if TIME_WITH_SYS_TIME
35 # include <sys/time.h>
36 # include <time.h>
37 #else
38 # if HAVE_SYS_TIME_H
39 #  include <sys/time.h>
40 # else
41 #  include <time.h>
42 # endif
43 #endif
44
45 #include <sys/wait.h>
46 #include <ctype.h>
47 #include <string.h>
48 #include <limits.h>
49 #include <libcitadel.h>
50 #include "citadel.h"
51 #include "server.h"
52 #include "sysdep_decls.h"
53 #include "citserver.h"
54 #include "support.h"
55 #include "config.h"
56 #include "database.h"
57 #include "msgbase.h"
58 #include "control.h"
59 #include "ft_wordbreaker.h"
60 #include "crc16.h"
61 #include "ctdl_module.h"
62
63 /*
64  * Noise words are not included in search indices.
65  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
66  * must also be changed, so that the index is rebuilt.
67  */
68
69 noise_word *noise_words[26];
70
71 static char *noise_words_init[] = {
72         "about",
73         "after",
74         "also",
75         "another",
76         "because",
77         "been",
78         "before",
79         "being",
80         "between",
81         "both",
82         "came",
83         "come",
84         "could",
85         "each",
86         "from",
87         "have",
88         "here",
89         "himself",
90         "into",
91         "like",
92         "make",
93         "many",
94         "might",
95         "more",
96         "most",
97         "much",
98         "must",
99         "never",
100         "only",
101         "other",
102         "over",
103         "said",
104         "same",
105         "should",
106         "since",
107         "some",
108         "still",
109         "such",
110         "take",
111         "than",
112         "that",
113         "their",
114         "them",
115         "then",
116         "there",
117         "these",
118         "they",
119         "this",
120         "those",
121         "through",
122         "under",
123         "very",
124         "well",
125         "were",
126         "what",
127         "where",
128         "which",
129         "while",
130         "with",
131         "would",
132         "your"
133 };
134
135
136 void initialize_noise_words(void)
137 {
138         int i;
139         int len;
140         int ch;
141         noise_word *next;
142         
143         memset (noise_words, 0, sizeof(noise_words));
144         
145         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
146         {
147                 ch = noise_words_init[i][0] - 'a';
148                 len = strlen(noise_words_init[i]);
149                 
150                 next = malloc(sizeof(noise_word));
151                 next->len = len;
152                 next->word = strdup(noise_words_init[i]);
153                 next->next = noise_words[ch];
154                 noise_words[ch] = next;
155         }
156 }
157
158
159 void noise_word_cleanup(void)
160 {
161         int i;
162         noise_word *cur, *next;
163         
164         CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
165         
166         for (i = 0 ; i < 26 ; i++)
167         {
168                 cur = noise_words[i];
169                 while (cur)
170                 {
171                         next = cur->next;
172                         free(cur->word);
173                         free(cur);
174                         cur = next;
175                 }
176         }
177 }
178
179 /*
180  * Compare function
181  */
182 int intcmp(const void *rec1, const void *rec2) {
183         int i1, i2;
184
185         i1 = *(const int *)rec1;
186         i2 = *(const int *)rec2;
187
188         if (i1 > i2) return(1);
189         if (i1 < i2) return(-1);
190         return(0);
191 }
192
193
194 void wordbreaker(char *text, int *num_tokens, int **tokens) {
195
196         int wb_num_tokens = 0;
197         int wb_num_alloc = 0;
198         int *wb_tokens = NULL;
199
200         char *ptr;
201         char *word_start;
202         char *word_end;
203         char ch;
204         int word_len;
205         char word[256];
206         int i;
207         int word_crc;
208         noise_word *noise;
209         
210         
211         if (text == NULL) {             /* no NULL text please */
212                 *num_tokens = 0;
213                 *tokens = NULL;
214                 return;
215         }
216
217         if (text[0] == 0) {             /* no empty text either */
218                 *num_tokens = 0;
219                 *tokens = NULL;
220                 return;
221         }
222
223         ptr = text;
224         word_start = NULL;
225         while (*ptr) {
226                 ch = *ptr;
227                 if (isalnum(ch)) {
228                         if (!word_start) {
229                                 word_start = ptr;
230                         }
231                 }
232                 ++ptr;
233                 ch = *ptr;
234                 if ( (!isalnum(ch)) && (word_start) ) {
235                         word_end = ptr;
236 //                      --word_end;
237
238                         /* extract the word */
239                         word_len = word_end - word_start;
240                         if (word_len >= sizeof word) {
241                                 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
242                                 safestrncpy(word, word_start, sizeof word);
243                                 word[(sizeof word) - 1] = 0;
244                         }
245                         else {
246                                 safestrncpy(word, word_start, word_len+1);
247                                 word[word_len] = 0;
248                         }
249                         word_start = NULL;
250
251                         /* are we ok with the length? */
252                         if ( (word_len >= WB_MIN)
253                            && (word_len <= WB_MAX) ) {
254                                 for (i=0; i<word_len; ++i) {
255                                         word[i] = tolower(word[i]);
256                                 }
257                                 /* disqualify noise words */
258                                 noise = noise_words[(int) (word[0]-'a')];
259                                 while (noise)
260                                 {
261                                         if (noise->len == word_len)
262                                         {
263                                                 if (!strcmp(word, noise->word)) 
264                                                 {
265                                                         word_len = 0;
266                                                         break;
267                                                 }
268                                         }
269                                         noise = noise->next;
270                                 }
271                                 if (word_len == 0)
272                                         continue;
273
274                                 word_crc = (int)
275                                         CalcCRC16Bytes(word_len, word);
276
277                                 ++wb_num_tokens;
278                                 if (wb_num_tokens > wb_num_alloc) {
279                                         wb_num_alloc += 512;
280                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
281                                 }
282                                 wb_tokens[wb_num_tokens - 1] = word_crc;
283                         }
284                 }
285         }
286
287         /* sort and purge dups */
288         if (wb_num_tokens > 1) {
289                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
290                 for (i=0; i<(wb_num_tokens-1); ++i) {
291                         if (wb_tokens[i] == wb_tokens[i+1]) {
292                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
293                                         ((wb_num_tokens - i - 1)*sizeof(int)));
294                                 --wb_num_tokens;
295                                 --i;
296                         }
297                 }
298         }
299
300         *num_tokens = wb_num_tokens;
301         *tokens = wb_tokens;
302 }
303