d8a11e55cfd437ddfab4a104a13609d3862190bf
[citadel.git] / citadel / server / modules / fulltext / ft_wordbreaker.c
1 /*
2  * Default wordbreaker module for full text indexing.
3  *
4  * Copyright (c) 2005-2017 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include "../../sysdep.h"
16 #include <stdlib.h>
17 #include <unistd.h>
18 #include <stdio.h>
19 #include <fcntl.h>
20 #include <signal.h>
21 #include <pwd.h>
22 #include <errno.h>
23 #include <sys/types.h>
24 #include <time.h>
25 #include <sys/wait.h>
26 #include <ctype.h>
27 #include <string.h>
28 #include <limits.h>
29 #include <libcitadel.h>
30 #include "../../citadel_defs.h"
31 #include "../../server.h"
32 #include "../../sysdep_decls.h"
33 #include "../../citserver.h"
34 #include "../../support.h"
35 #include "../../config.h"
36 #include "../../database.h"
37 #include "../../msgbase.h"
38 #include "../../control.h"
39 #include "ft_wordbreaker.h"
40 #include "crc16.h"
41 #include "../../ctdl_module.h"
42
/*
 * Noise words are not included in search indices.
 * Every entry is lowercase; candidate words are lowercased before they are
 * compared against this table.
 *
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 *
 * The table is read-only, so both the strings and the pointer array are const.
 */
static const char *const noise_words[] = {
        "about",
        "after",
        "also",
        "another",
        "because",
        "been",
        "before",
        "being",
        "between",
        "both",
        "came",
        "come",
        "could",
        "each",
        "from",
        "have",
        "here",
        "himself",
        "into",
        "like",
        "make",
        "many",
        "might",
        "more",
        "most",
        "much",
        "must",
        "never",
        "only",
        "other",
        "over",
        "said",
        "same",
        "should",
        "since",
        "some",
        "still",
        "such",
        "take",
        "than",
        "that",
        "their",
        "them",
        "then",
        "there",
        "these",
        "they",
        "this",
        "those",
        "through",
        "under",
        "very",
        "well",
        "were",
        "what",
        "where",
        "which",
        "while",
        "with",
        "would",
        "your"
};
/* Number of entries in noise_words[], derived from the element type itself. */
#define NUM_NOISE (sizeof(noise_words) / sizeof(noise_words[0]))
112
113
/*
 * Three-way comparison of two int records, with the callback signature
 * expected by qsort(3)-style sort routines.
 *
 * rec1, rec2   pointers to the two int values to compare
 *
 * Returns 1 if *rec1 > *rec2, -1 if *rec1 < *rec2, and 0 if they are equal.
 */
int intcmp(const void *rec1, const void *rec2) {
        const int lhs = *(const int *)rec1;
        const int rhs = *(const int *)rec2;

        /* Each relational test yields 0 or 1, so the difference is -1, 0 or 1
         * without any risk of overflow from subtracting the values themselves.
         */
        return((lhs > rhs) - (lhs < rhs));
}
127
128
129 Array *wordbreaker(const char *text) {
130         const char *ptr;
131         const char *word_start;
132         const char *word_end;
133         char ch;
134         int word_len;
135         char word[256];
136         int i;
137         int word_crc;
138
139         if (text == NULL) {             /* no NULL text please */
140                 return(NULL);
141         }
142
143         if (text[0] == 0) {             /* no empty text either */
144                 return(NULL);
145         }
146
147         Array *found_tokens = array_new(sizeof(int));
148         if (found_tokens == NULL) {
149                 return(NULL);
150         }
151
152         ptr = text;
153         word_start = NULL;
154         while (*ptr) {
155                 ch = *ptr;
156                 if (isalnum(ch)) {
157                         if (!word_start) {
158                                 word_start = ptr;
159                         }
160                 }
161                 ++ptr;
162                 ch = *ptr;
163                 if ( (!isalnum(ch)) && (word_start) ) {
164                         word_end = ptr;
165
166                         /* extract the word */
167                         word_len = word_end - word_start;
168                         if (word_len >= sizeof word) {
169                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
170                                 safestrncpy(word, word_start, sizeof word);
171                                 word[(sizeof word) - 1] = 0;
172                         }
173                         else {
174                                 safestrncpy(word, word_start, word_len+1);
175                                 word[word_len] = 0;
176                         }
177                         word_start = NULL;
178
179                         /* are we ok with the length? */
180                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
181                                 for (i=0; i<word_len; ++i) {
182                                         word[i] = tolower(word[i]);
183                                 }
184                                 /* disqualify noise words */
185                                 for (i=0; i<NUM_NOISE; ++i) {
186                                         if (!strcasecmp(word, noise_words[i])) {
187                                                 word_len = 0;
188                                                 break;
189                                         }
190                                 }
191                                 /* FIXME make this case insensitive */
192                                 /* add it to the array */
193                                 if (word_len > 0) {
194                                         word_crc = (int) CalcCRC16Bytes(word_len, word);
195                                         array_append(found_tokens, &word_crc);
196                                 }
197                         }
198                 }
199         }
200
201         /* sort and purge dups */
202         if (array_len(found_tokens) > 1) {
203                 array_sort(found_tokens, intcmp);
204                 for (i=0; i<(array_len(found_tokens)); ++i) {
205                         if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
206                                 array_delete_element_at(found_tokens, i);
207                                 --i;
208                         }
209                 }
210         }
211         return(found_tokens);
212 }
213