stable now but there are GIANT PIECES MISSING
[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
1 /*
2  * Default wordbreaker module for full text indexing.
3  *
4  * Copyright (c) 2005-2017 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include "sysdep.h"
16 #include <stdlib.h>
17 #include <unistd.h>
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <signal.h>
21 #include <pwd.h>
22 #include <errno.h>
23 #include <sys/types.h>
24 #include <time.h>
25 #include <sys/wait.h>
26 #include <ctype.h>
27 #include <string.h>
28 #include <limits.h>
29 #include <libcitadel.h>
30 #include "citadel.h"
31 #include "server.h"
32 #include "sysdep_decls.h"
33 #include "citserver.h"
34 #include "support.h"
35 #include "config.h"
36 #include "database.h"
37 #include "msgbase.h"
38 #include "control.h"
39 #include "ft_wordbreaker.h"
40 #include "crc16.h"
41 #include "ctdl_module.h"
42
43 /*
44  * Noise words are not included in search indices.
45  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
46  * must also be changed, so that the index is rebuilt.
47  */
48 static char *noise_words[] = {
49         "about",
50         "after",
51         "also",
52         "another",
53         "because",
54         "been",
55         "before",
56         "being",
57         "between",
58         "both",
59         "came",
60         "come",
61         "could",
62         "each",
63         "from",
64         "have",
65         "here",
66         "himself",
67         "into",
68         "like",
69         "make",
70         "many",
71         "might",
72         "more",
73         "most",
74         "much",
75         "must",
76         "never",
77         "only",
78         "other",
79         "over",
80         "said",
81         "same",
82         "should",
83         "since",
84         "some",
85         "still",
86         "such",
87         "take",
88         "than",
89         "that",
90         "their",
91         "them",
92         "then",
93         "there",
94         "these",
95         "they",
96         "this",
97         "those",
98         "through",
99         "under",
100         "very",
101         "well",
102         "were",
103         "what",
104         "where",
105         "which",
106         "while",
107         "with",
108         "would",
109         "your"
110 };
111 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
112
113
114 /*
115  * Compare function
116  */
117 int intcmp(const void *rec1, const void *rec2) {
118         int i1, i2;
119
120         i1 = *(const int *)rec1;
121         i2 = *(const int *)rec2;
122
123         if (i1 > i2) return(1);
124         if (i1 < i2) return(-1);
125         return(0);
126 }
127
128
129 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
130
131         int wb_num_tokens = 0;
132         int wb_num_alloc = 0;
133         int *wb_tokens = NULL;
134
135         const char *ptr;
136         const char *word_start;
137         const char *word_end;
138         char ch;
139         int word_len;
140         char word[256];
141         int i;
142         int word_crc;
143         
144         if (text == NULL) {             /* no NULL text please */
145                 *num_tokens = 0;
146                 *tokens = NULL;
147                 return;
148         }
149
150         if (text[0] == 0) {             /* no empty text either */
151                 *num_tokens = 0;
152                 *tokens = NULL;
153                 return;
154         }
155
156         ptr = text;
157         word_start = NULL;
158         while (*ptr) {
159                 ch = *ptr;
160                 if (isalnum(ch)) {
161                         if (!word_start) {
162                                 word_start = ptr;
163                         }
164                 }
165                 ++ptr;
166                 ch = *ptr;
167                 if ( (!isalnum(ch)) && (word_start) ) {
168                         word_end = ptr;
169
170                         /* extract the word */
171                         word_len = word_end - word_start;
172                         if (word_len >= sizeof word) {
173                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
174                                 safestrncpy(word, word_start, sizeof word);
175                                 word[(sizeof word) - 1] = 0;
176                         }
177                         else {
178                                 safestrncpy(word, word_start, word_len+1);
179                                 word[word_len] = 0;
180                         }
181                         word_start = NULL;
182
183                         /* are we ok with the length? */
184                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
185                                 for (i=0; i<word_len; ++i) {
186                                         word[i] = tolower(word[i]);
187                                 }
188                                 /* disqualify noise words */
189                                 for (i=0; i<NUM_NOISE; ++i) {
190                                         if (!strcmp(word, noise_words[i])) {
191                                                 word_len = 0;
192                                                 break;
193                                         }
194                                 }
195
196                                 if (word_len == 0)
197                                         continue;
198
199                                 word_crc = (int) CalcCRC16Bytes(word_len, word);
200
201                                 ++wb_num_tokens;
202                                 if (wb_num_tokens > wb_num_alloc) {
203                                         wb_num_alloc += 512;
204                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
205                                 }
206                                 wb_tokens[wb_num_tokens - 1] = word_crc;
207                         }
208                 }
209         }
210
211         /* sort and purge dups */
212         if (wb_num_tokens > 1) {
213                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
214                 for (i=0; i<(wb_num_tokens-1); ++i) {
215                         if (wb_tokens[i] == wb_tokens[i+1]) {
216                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
217                                         ((wb_num_tokens - i - 1)*sizeof(int)));
218                                 --wb_num_tokens;
219                                 --i;
220                         }
221                 }
222         }
223
224         *num_tokens = wb_num_tokens;
225         *tokens = wb_tokens;
226 }
227