More removal of $Id$ tags
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * Default wordbreaker module for full text indexing.
3  *
4  * Copyright (c) 2005-2009 by the citadel.org team
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 3 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, write to the Free Software
18  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  */
20
21
22 #include "sysdep.h"
23 #include <stdlib.h>
24 #include <unistd.h>
25 #include <stdio.h>
26 #include <fcntl.h>
27 #include <signal.h>
28 #include <pwd.h>
29 #include <errno.h>
30 #include <sys/types.h>
31
32 #if TIME_WITH_SYS_TIME
33 # include <sys/time.h>
34 # include <time.h>
35 #else
36 # if HAVE_SYS_TIME_H
37 #  include <sys/time.h>
38 # else
39 #  include <time.h>
40 # endif
41 #endif
42
43 #include <sys/wait.h>
44 #include <ctype.h>
45 #include <string.h>
46 #include <limits.h>
47 #include <libcitadel.h>
48 #include "citadel.h"
49 #include "server.h"
50 #include "sysdep_decls.h"
51 #include "citserver.h"
52 #include "support.h"
53 #include "config.h"
54 #include "database.h"
55 #include "msgbase.h"
56 #include "control.h"
57 #include "ft_wordbreaker.h"
58 #include "crc16.h"
59 #include "ctdl_module.h"
60
61 /*
62  * Noise words are not included in search indices.
63  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
64  * must also be changed, so that the index is rebuilt.
65  */
66
67 noise_word *noise_words[26];
68
69 static char *noise_words_init[] = {
70         "about",
71         "after",
72         "also",
73         "another",
74         "because",
75         "been",
76         "before",
77         "being",
78         "between",
79         "both",
80         "came",
81         "come",
82         "could",
83         "each",
84         "from",
85         "have",
86         "here",
87         "himself",
88         "into",
89         "like",
90         "make",
91         "many",
92         "might",
93         "more",
94         "most",
95         "much",
96         "must",
97         "never",
98         "only",
99         "other",
100         "over",
101         "said",
102         "same",
103         "should",
104         "since",
105         "some",
106         "still",
107         "such",
108         "take",
109         "than",
110         "that",
111         "their",
112         "them",
113         "then",
114         "there",
115         "these",
116         "they",
117         "this",
118         "those",
119         "through",
120         "under",
121         "very",
122         "well",
123         "were",
124         "what",
125         "where",
126         "which",
127         "while",
128         "with",
129         "would",
130         "your"
131 };
132
133
134 void initialize_noise_words(void)
135 {
136         int i;
137         int len;
138         int ch;
139         noise_word *next;
140         
141         memset (noise_words, 0, sizeof(noise_words));
142         
143         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
144         {
145                 ch = noise_words_init[i][0] - 'a';
146                 len = strlen(noise_words_init[i]);
147                 
148                 next = malloc(sizeof(noise_word));
149                 next->len = len;
150                 next->word = strdup(noise_words_init[i]);
151                 next->next = noise_words[ch];
152                 noise_words[ch] = next;
153         }
154 }
155
156
157 void noise_word_cleanup(void)
158 {
159         int i;
160         noise_word *cur, *next;
161         
162         CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
163         
164         for (i = 0 ; i < 26 ; i++)
165         {
166                 cur = noise_words[i];
167                 while (cur)
168                 {
169                         next = cur->next;
170                         free(cur->word);
171                         free(cur);
172                         cur = next;
173                 }
174         }
175 }
176
177 /*
178  * Compare function
179  */
180 int intcmp(const void *rec1, const void *rec2) {
181         int i1, i2;
182
183         i1 = *(const int *)rec1;
184         i2 = *(const int *)rec2;
185
186         if (i1 > i2) return(1);
187         if (i1 < i2) return(-1);
188         return(0);
189 }
190
191
192 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
193
194         int wb_num_tokens = 0;
195         int wb_num_alloc = 0;
196         int *wb_tokens = NULL;
197
198         const char *ptr;
199         const char *word_start;
200         const char *word_end;
201         char ch;
202         int word_len;
203         char word[256];
204         int i;
205         int word_crc;
206         noise_word *noise;
207         
208         
209         if (text == NULL) {             /* no NULL text please */
210                 *num_tokens = 0;
211                 *tokens = NULL;
212                 return;
213         }
214
215         if (text[0] == 0) {             /* no empty text either */
216                 *num_tokens = 0;
217                 *tokens = NULL;
218                 return;
219         }
220
221         ptr = text;
222         word_start = NULL;
223         while (*ptr) {
224                 ch = *ptr;
225                 if (isalnum(ch)) {
226                         if (!word_start) {
227                                 word_start = ptr;
228                         }
229                 }
230                 ++ptr;
231                 ch = *ptr;
232                 if ( (!isalnum(ch)) && (word_start) ) {
233                         word_end = ptr;
234 //                      --word_end;
235
236                         /* extract the word */
237                         word_len = word_end - word_start;
238                         if (word_len >= sizeof word) {
239                                 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
240                                 safestrncpy(word, word_start, sizeof word);
241                                 word[(sizeof word) - 1] = 0;
242                         }
243                         else {
244                                 safestrncpy(word, word_start, word_len+1);
245                                 word[word_len] = 0;
246                         }
247                         word_start = NULL;
248
249                         /* are we ok with the length? */
250                         if ( (word_len >= WB_MIN)
251                            && (word_len <= WB_MAX) ) {
252                                 for (i=0; i<word_len; ++i) {
253                                         word[i] = tolower(word[i]);
254                                 }
255                                 /* disqualify noise words */
256                                 noise = noise_words[(int) (word[0]-'a')];
257                                 while (noise)
258                                 {
259                                         if (noise->len == word_len)
260                                         {
261                                                 if (!strcmp(word, noise->word)) 
262                                                 {
263                                                         word_len = 0;
264                                                         break;
265                                                 }
266                                         }
267                                         noise = noise->next;
268                                 }
269                                 if (word_len == 0)
270                                         continue;
271
272                                 word_crc = (int)
273                                         CalcCRC16Bytes(word_len, word);
274
275                                 ++wb_num_tokens;
276                                 if (wb_num_tokens > wb_num_alloc) {
277                                         wb_num_alloc += 512;
278                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
279                                 }
280                                 wb_tokens[wb_num_tokens - 1] = word_crc;
281                         }
282                 }
283         }
284
285         /* sort and purge dups */
286         if (wb_num_tokens > 1) {
287                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
288                 for (i=0; i<(wb_num_tokens-1); ++i) {
289                         if (wb_tokens[i] == wb_tokens[i+1]) {
290                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
291                                         ((wb_num_tokens - i - 1)*sizeof(int)));
292                                 --wb_num_tokens;
293                                 --i;
294                         }
295                 }
296         }
297
298         *num_tokens = wb_num_tokens;
299         *tokens = wb_tokens;
300 }
301