HUGE PATCH. This moves all of mime_parser.c and all
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include <libcitadel.h>
35 #include "citadel.h"
36 #include "server.h"
37 #include "sysdep_decls.h"
38 #include "citserver.h"
39 #include "support.h"
40 #include "config.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "ft_wordbreaker.h"
45 #include "crc16.h"
46
/*
 * Noise words are not included in search indices.
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 *
 * The table is read-only: both the pointers and the strings they point to
 * are const, so an accidental write is rejected at compile time.
 * Entries are lower case and listed in alphabetical order.
 */
static const char *const noise_words[] = {
        "about",
        "after",
        "also",
        "another",
        "because",
        "been",
        "before",
        "being",
        "between",
        "both",
        "came",
        "come",
        "could",
        "each",
        "from",
        "have",
        "here",
        "himself",
        "into",
        "like",
        "make",
        "many",
        "might",
        "more",
        "most",
        "much",
        "must",
        "never",
        "only",
        "other",
        "over",
        "said",
        "same",
        "should",
        "since",
        "some",
        "still",
        "such",
        "take",
        "than",
        "that",
        "their",
        "them",
        "then",
        "there",
        "these",
        "they",
        "this",
        "those",
        "through",
        "under",
        "very",
        "well",
        "were",
        "what",
        "where",
        "which",
        "while",
        "with",
        "would",
        "your"
};
115
/*
 * Three-way comparison of two ints, in the form expected by qsort().
 *
 * rec1, rec2   pointers to the int values being compared
 *
 * Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller,
 * and 0 if the two values are equal.
 */
int intcmp(const void *rec1, const void *rec2) {
        const int lhs = *(const int *)rec1;
        const int rhs = *(const int *)rec2;

        /* Each relational test yields 0 or 1, so the difference is -1, 0 or 1
         * and cannot overflow the way "lhs - rhs" could. */
        return((lhs > rhs) - (lhs < rhs));
}
129
130
131 void wordbreaker(char *text, int *num_tokens, int **tokens) {
132
133         int wb_num_tokens = 0;
134         int wb_num_alloc = 0;
135         int *wb_tokens = NULL;
136
137         char *ptr;
138         char *word_start;
139         char *word_end;
140         char ch;
141         int word_len;
142         char word[256];
143         int i;
144         int word_crc;
145
146         if (text == NULL) {             /* no NULL text please */
147                 *num_tokens = 0;
148                 *tokens = NULL;
149                 return;
150         }
151
152         if (text[0] == 0) {             /* no empty text either */
153                 *num_tokens = 0;
154                 *tokens = NULL;
155                 return;
156         }
157
158         ptr = text;
159         word_start = NULL;
160         while (*ptr) {
161                 ch = *ptr;
162                 if (isalnum(ch)) {
163                         if (!word_start) {
164                                 word_start = ptr;
165                         }
166                 }
167                 ++ptr;
168                 ch = *ptr;
169                 if ( (!isalnum(ch)) && (word_start) ) {
170                         word_end = ptr;
171 //                      --word_end;
172
173                         /* extract the word */
174                         word_len = word_end - word_start;
175                         if (word_len >= sizeof word) {
176                                 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
177                                 safestrncpy(word, word_start, sizeof word);
178                                 word[(sizeof word) - 1] = 0;
179                         }
180                         else {
181                                 safestrncpy(word, word_start, word_len+1);
182                                 word[word_len] = 0;
183                         }
184                         word_start = NULL;
185
186                         /* are we ok with the length? */
187                         if ( (word_len >= WB_MIN)
188                            && (word_len <= WB_MAX) ) {
189                                 for (i=0; i<word_len; ++i) {
190                                         word[i] = tolower(word[i]);
191                                 }
192                                 /* disqualify noise words */
193                                 for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
194                                         if (!strcmp(word, noise_words[i])) {
195                                                 word_len = 0;
196                                                 break;
197                                         }
198                                 }
199                                 if (word_len == 0)
200                                         continue;
201
202                                 word_crc = (int)
203                                         CalcCRC16Bytes(word_len, word);
204
205                                 ++wb_num_tokens;
206                                 if (wb_num_tokens > wb_num_alloc) {
207                                         wb_num_alloc += 512;
208                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
209                                 }
210                                 wb_tokens[wb_num_tokens - 1] = word_crc;
211                         }
212                 }
213         }
214
215         /* sort and purge dups */
216         if (wb_num_tokens > 1) {
217                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
218                 for (i=0; i<(wb_num_tokens-1); ++i) {
219                         if (wb_tokens[i] == wb_tokens[i+1]) {
220                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
221                                         ((wb_num_tokens - i - 1)*sizeof(int)));
222                                 --wb_num_tokens;
223                                 --i;
224                         }
225                 }
226         }
227
228         *num_tokens = wb_num_tokens;
229         *tokens = wb_tokens;
230 }
231