No more cleanup hooks. The OS can reclaim memory better than we can. We want to...
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * Default wordbreaker module for full text indexing.
3  *
4  * Copyright (c) 2005-2017 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include "sysdep.h"
16 #include <stdlib.h>
17 #include <unistd.h>
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <signal.h>
21 #include <pwd.h>
22 #include <errno.h>
23 #include <sys/types.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <sys/wait.h>
37 #include <ctype.h>
38 #include <string.h>
39 #include <limits.h>
40 #include <libcitadel.h>
41 #include "citadel.h"
42 #include "server.h"
43 #include "sysdep_decls.h"
44 #include "citserver.h"
45 #include "support.h"
46 #include "config.h"
47 #include "database.h"
48 #include "msgbase.h"
49 #include "control.h"
50 #include "ft_wordbreaker.h"
51 #include "crc16.h"
52 #include "ctdl_module.h"
53
54 /*
55  * Noise words are not included in search indices.
56  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
57  * must also be changed, so that the index is rebuilt.
58  */
59
60 noise_word *noise_words[26];
61
62 static char *noise_words_init[] = {
63         "about",
64         "after",
65         "also",
66         "another",
67         "because",
68         "been",
69         "before",
70         "being",
71         "between",
72         "both",
73         "came",
74         "come",
75         "could",
76         "each",
77         "from",
78         "have",
79         "here",
80         "himself",
81         "into",
82         "like",
83         "make",
84         "many",
85         "might",
86         "more",
87         "most",
88         "much",
89         "must",
90         "never",
91         "only",
92         "other",
93         "over",
94         "said",
95         "same",
96         "should",
97         "since",
98         "some",
99         "still",
100         "such",
101         "take",
102         "than",
103         "that",
104         "their",
105         "them",
106         "then",
107         "there",
108         "these",
109         "they",
110         "this",
111         "those",
112         "through",
113         "under",
114         "very",
115         "well",
116         "were",
117         "what",
118         "where",
119         "which",
120         "while",
121         "with",
122         "would",
123         "your"
124 };
125
126
127 void initialize_noise_words(void)
128 {
129         int i;
130         int len;
131         int ch;
132         noise_word *next;
133         
134         memset (noise_words, 0, sizeof(noise_words));
135         
136         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
137         {
138                 ch = noise_words_init[i][0] - 'a';
139                 len = strlen(noise_words_init[i]);
140                 
141                 next = malloc(sizeof(noise_word));
142                 next->len = len;
143                 next->word = strdup(noise_words_init[i]);
144                 next->next = noise_words[ch];
145                 noise_words[ch] = next;
146         }
147 }
148
149
150 /*
151  * Compare function
152  */
153 int intcmp(const void *rec1, const void *rec2) {
154         int i1, i2;
155
156         i1 = *(const int *)rec1;
157         i2 = *(const int *)rec2;
158
159         if (i1 > i2) return(1);
160         if (i1 < i2) return(-1);
161         return(0);
162 }
163
164
165 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
166
167         int wb_num_tokens = 0;
168         int wb_num_alloc = 0;
169         int *wb_tokens = NULL;
170
171         const char *ptr;
172         const char *word_start;
173         const char *word_end;
174         char ch;
175         int word_len;
176         char word[256];
177         int i;
178         int word_crc;
179         noise_word *noise;
180         
181         
182         if (text == NULL) {             /* no NULL text please */
183                 *num_tokens = 0;
184                 *tokens = NULL;
185                 return;
186         }
187
188         if (text[0] == 0) {             /* no empty text either */
189                 *num_tokens = 0;
190                 *tokens = NULL;
191                 return;
192         }
193
194         ptr = text;
195         word_start = NULL;
196         while (*ptr) {
197                 ch = *ptr;
198                 if (isalnum(ch)) {
199                         if (!word_start) {
200                                 word_start = ptr;
201                         }
202                 }
203                 ++ptr;
204                 ch = *ptr;
205                 if ( (!isalnum(ch)) && (word_start) ) {
206                         word_end = ptr;
207
208                         /* extract the word */
209                         word_len = word_end - word_start;
210                         if (word_len >= sizeof word) {
211                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
212                                 safestrncpy(word, word_start, sizeof word);
213                                 word[(sizeof word) - 1] = 0;
214                         }
215                         else {
216                                 safestrncpy(word, word_start, word_len+1);
217                                 word[word_len] = 0;
218                         }
219                         word_start = NULL;
220
221                         /* are we ok with the length? */
222                         if ( (word_len >= WB_MIN)
223                            && (word_len <= WB_MAX) ) {
224                                 for (i=0; i<word_len; ++i) {
225                                         word[i] = tolower(word[i]);
226                                 }
227                                 /* disqualify noise words */
228                                 noise = noise_words[(int) (word[0]-'a')];
229                                 while (noise)
230                                 {
231                                         if (noise->len == word_len)
232                                         {
233                                                 if (!strcmp(word, noise->word)) 
234                                                 {
235                                                         word_len = 0;
236                                                         break;
237                                                 }
238                                         }
239                                         noise = noise->next;
240                                 }
241                                 if (word_len == 0)
242                                         continue;
243
244                                 word_crc = (int)
245                                         CalcCRC16Bytes(word_len, word);
246
247                                 ++wb_num_tokens;
248                                 if (wb_num_tokens > wb_num_alloc) {
249                                         wb_num_alloc += 512;
250                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
251                                 }
252                                 wb_tokens[wb_num_tokens - 1] = word_crc;
253                         }
254                 }
255         }
256
257         /* sort and purge dups */
258         if (wb_num_tokens > 1) {
259                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
260                 for (i=0; i<(wb_num_tokens-1); ++i) {
261                         if (wb_tokens[i] == wb_tokens[i+1]) {
262                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
263                                         ((wb_num_tokens - i - 1)*sizeof(int)));
264                                 --wb_num_tokens;
265                                 --i;
266                         }
267                 }
268         }
269
270         *num_tokens = wb_num_tokens;
271         *tokens = wb_tokens;
272 }
273