code.citadel.org Git - citadel.git/blob - citadel/server/modules/fulltext/ft

1 /*

2 * Default wordbreaker module for full text indexing.

3 *

5 *

6 * This program is open source software; you can redistribute it and/or modify

7 * it under the terms of the GNU General Public License version 3.

8 *

9 * This program is distributed in the hope that it will be useful,

10 * but WITHOUT ANY WARRANTY; without even the implied warranty of

11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

12 * GNU General Public License for more details.

13 */

15 #include "../../sysdep.h"

16 #include <stdlib.h>

17 #include <unistd.h>

18 #include <stdio.h>

19 #include <fcntl.h>

20 #include <signal.h>

21 #include <pwd.h>

22 #include <errno.h>

23 #include <sys/types.h>

24 #include <time.h>

25 #include <sys/wait.h>

26 #include <ctype.h>

27 #include <string.h>

28 #include <limits.h>

29 #include <libcitadel.h>

30 #include "../../citadel_defs.h"

31 #include "../../server.h"

32 #include "../../sysdep_decls.h"

33 #include "../../citserver.h"

34 #include "../../support.h"

35 #include "../../config.h"

36 #include "../../database.h"

37 #include "../../msgbase.h"

38 #include "../../control.h"

39 #include "ft_wordbreaker.h"

40 #include "crc16.h"

41 #include "../../ctdl_module.h"

/*
 * Noise words: common words that wordbreaker() refuses to index.
 *
 * The table is read-only, so both the pointers and the strings they point to
 * are const.  Entries are lowercase and are compared against lowercased input.
 *
 * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
 * must also be changed, so that the index is rebuilt.
 */
static const char *const noise_words[] = {
	"about",
	"after",
	"also",
	"another",
	"because",
	"been",
	"before",
	"being",
	"between",
	"both",
	"came",
	"come",
	"could",
	"each",
	"from",
	"have",
	"here",
	"himself",
	"into",
	"like",
	"make",
	"many",
	"might",
	"more",
	"most",
	"much",
	"must",
	"never",
	"only",
	"other",
	"over",
	"said",
	"same",
	"should",
	"since",
	"some",
	"still",
	"such",
	"take",
	"than",
	"that",
	"their",
	"them",
	"then",
	"there",
	"these",
	"they",
	"this",
	"those",
	"through",
	"under",
	"very",
	"well",
	"were",
	"what",
	"where",
	"which",
	"while",
	"with",
	"would",
	"your"
};

/* Number of entries in noise_words[]; tied to the element type of the array. */
#define NUM_NOISE (sizeof(noise_words) / sizeof(noise_words[0]))

112

113

/*
 * Three-way comparison of two ints, in the form expected by qsort()-style
 * sort routines.
 *
 * rec1, rec2	pointers to the two int values to compare
 *
 * Returns 1 if *rec1 is greater than *rec2, -1 if it is less, 0 if equal.
 */
int intcmp(const void *rec1, const void *rec2) {
	const int lhs = *(const int *)rec1;
	const int rhs = *(const int *)rec2;

	/* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
	return((lhs > rhs) - (lhs < rhs));
}

127

128

129 Array *wordbreaker(const char *text) {

130 const char *ptr;

131 const char *word_start;

132 const char *word_end;

133 char ch;

134 int word_len;

135 char word[256];

136 int i;

137 int word_crc;

138

139 if (text == NULL) { /* no NULL text please */

140 return(NULL);

141 }

142

143 if (text[0] == 0) { /* no empty text either */

144 return(NULL);

145 }

146

147 Array *found_tokens = array_new(sizeof(int));

148 if (found_tokens == NULL) {

149 return(NULL);

150 }

151

152 ptr = text;

153 word_start = NULL;

154 while (*ptr) {

155 ch = *ptr;

156 if (isalnum(ch)) {

157 if (!word_start) {

158 word_start = ptr;

159 }

160 }

161 ++ptr;

162 ch = *ptr;

163 if ( (!isalnum(ch)) && (word_start) ) {

164 word_end = ptr;

165

166 /* extract the word */

167 word_len = word_end - word_start;

168 if (word_len >= sizeof word) {

169 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);

170 safestrncpy(word, word_start, sizeof word);

171 word[(sizeof word) - 1] = 0;

172 }

173 else {

174 safestrncpy(word, word_start, word_len+1);

175 word[word_len] = 0;

176 }

177 word_start = NULL;

178

179 /* are we ok with the length? */

180 if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {

181 for (i=0; i<word_len; ++i) {

182 word[i] = tolower(word[i]);

183 }

184 /* disqualify noise words */

185 for (i=0; i<NUM_NOISE; ++i) {

186 if (!strcasecmp(word, noise_words[i])) {

187 word_len = 0;

188 break;

189 }

190 }

191 /* FIXME make this case insensitive */

192 /* add it to the array */

193 if (word_len > 0) {

194 word_crc = (int) CalcCRC16Bytes(word_len, word);

195 array_append(found_tokens, &word_crc);

196 }

197 }

198 }

199 }

200

201 /* sort and purge dups */

202 if (array_len(found_tokens) > 1) {

203 array_sort(found_tokens, intcmp);

204 for (i=0; i<(array_len(found_tokens)); ++i) {

205 if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {

206 array_delete_element_at(found_tokens, i);

207 --i;

208 }

209 }

210 }

211 return(found_tokens);

212 }

213