Did away with lprintf altogether; it is now called CtdlLogPrintf()
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * $Id$
3  *
4  * Default wordbreaker module for full text indexing.
5  *
6  */
7
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <pwd.h>
16 #include <errno.h>
17 #include <sys/types.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <sys/wait.h>
31 #include <ctype.h>
32 #include <string.h>
33 #include <limits.h>
34 #include <libcitadel.h>
35 #include "citadel.h"
36 #include "server.h"
37 #include "sysdep_decls.h"
38 #include "citserver.h"
39 #include "support.h"
40 #include "config.h"
41 #include "database.h"
42 #include "msgbase.h"
43 #include "control.h"
44 #include "ft_wordbreaker.h"
45 #include "crc16.h"
46 #include "ctdl_module.h"
47
/*
 * Noise words are not included in search indices.  Entries are stored in
 * lower case because candidate words are folded to lower case before being
 * compared against this table.
 *
 * The table is read-only: both the strings and the pointer array are const.
 *
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 */
static const char *const noise_words[] = {
	"about",
	"after",
	"also",
	"another",
	"because",
	"been",
	"before",
	"being",
	"between",
	"both",
	"came",
	"come",
	"could",
	"each",
	"from",
	"have",
	"here",
	"himself",
	"into",
	"like",
	"make",
	"many",
	"might",
	"more",
	"most",
	"much",
	"must",
	"never",
	"only",
	"other",
	"over",
	"said",
	"same",
	"should",
	"since",
	"some",
	"still",
	"such",
	"take",
	"than",
	"that",
	"their",
	"them",
	"then",
	"there",
	"these",
	"they",
	"this",
	"those",
	"through",
	"under",
	"very",
	"well",
	"were",
	"what",
	"where",
	"which",
	"while",
	"with",
	"would",
	"your"
};
116
/*
 * Three-way comparison of two ints, in the form qsort() expects.
 *
 * rec1, rec2	pointers to the two int values being compared
 *
 * Returns 1 when *rec1 is greater, -1 when it is smaller, 0 when equal.
 * Explicit comparisons are used rather than a subtraction, so extreme
 * values cannot overflow.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int *lhs = rec1;
	const int *rhs = rec2;

	if (*lhs == *rhs) {
		return(0);
	}
	return((*lhs > *rhs) ? 1 : -1);
}
130
131
132 void wordbreaker(char *text, int *num_tokens, int **tokens) {
133
134         int wb_num_tokens = 0;
135         int wb_num_alloc = 0;
136         int *wb_tokens = NULL;
137
138         char *ptr;
139         char *word_start;
140         char *word_end;
141         char ch;
142         int word_len;
143         char word[256];
144         int i;
145         int word_crc;
146
147         if (text == NULL) {             /* no NULL text please */
148                 *num_tokens = 0;
149                 *tokens = NULL;
150                 return;
151         }
152
153         if (text[0] == 0) {             /* no empty text either */
154                 *num_tokens = 0;
155                 *tokens = NULL;
156                 return;
157         }
158
159         ptr = text;
160         word_start = NULL;
161         while (*ptr) {
162                 ch = *ptr;
163                 if (isalnum(ch)) {
164                         if (!word_start) {
165                                 word_start = ptr;
166                         }
167                 }
168                 ++ptr;
169                 ch = *ptr;
170                 if ( (!isalnum(ch)) && (word_start) ) {
171                         word_end = ptr;
172 //                      --word_end;
173
174                         /* extract the word */
175                         word_len = word_end - word_start;
176                         if (word_len >= sizeof word) {
177                                 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
178                                 safestrncpy(word, word_start, sizeof word);
179                                 word[(sizeof word) - 1] = 0;
180                         }
181                         else {
182                                 safestrncpy(word, word_start, word_len+1);
183                                 word[word_len] = 0;
184                         }
185                         word_start = NULL;
186
187                         /* are we ok with the length? */
188                         if ( (word_len >= WB_MIN)
189                            && (word_len <= WB_MAX) ) {
190                                 for (i=0; i<word_len; ++i) {
191                                         word[i] = tolower(word[i]);
192                                 }
193                                 /* disqualify noise words */
194                                 for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
195                                         if (!strcmp(word, noise_words[i])) {
196                                                 word_len = 0;
197                                                 break;
198                                         }
199                                 }
200                                 if (word_len == 0)
201                                         continue;
202
203                                 word_crc = (int)
204                                         CalcCRC16Bytes(word_len, word);
205
206                                 ++wb_num_tokens;
207                                 if (wb_num_tokens > wb_num_alloc) {
208                                         wb_num_alloc += 512;
209                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
210                                 }
211                                 wb_tokens[wb_num_tokens - 1] = word_crc;
212                         }
213                 }
214         }
215
216         /* sort and purge dups */
217         if (wb_num_tokens > 1) {
218                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
219                 for (i=0; i<(wb_num_tokens-1); ++i) {
220                         if (wb_tokens[i] == wb_tokens[i+1]) {
221                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
222                                         ((wb_num_tokens - i - 1)*sizeof(int)));
223                                 --wb_num_tokens;
224                                 --i;
225                         }
226                 }
227         }
228
229         *num_tokens = wb_num_tokens;
230         *tokens = wb_tokens;
231 }
232