citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * $Id$
   3  *
   4  * Default wordbreaker module for full text indexing.
   5  *
   6  */
   7
   8
   9 #include "sysdep.h"
  10 #include <stdlib.h>
  11 #include <unistd.h>
  12 #include <stdio.h>
  13 #include <fcntl.h>
  14 #include <signal.h>
  15 #include <pwd.h>
  16 #include <errno.h>
  17 #include <sys/types.h>
  18
  19 #if TIME_WITH_SYS_TIME
  20 # include <sys/time.h>
  21 # include <time.h>
  22 #else
  23 # if HAVE_SYS_TIME_H
  24 #  include <sys/time.h>
  25 # else
  26 #  include <time.h>
  27 # endif
  28 #endif
  29
  30 #include <sys/wait.h>
  31 #include <ctype.h>
  32 #include <string.h>
  33 #include <limits.h>
  34 #include <libcitadel.h>
  35 #include "citadel.h"
  36 #include "server.h"
  37 #include "sysdep_decls.h"
  38 #include "citserver.h"
  39 #include "support.h"
  40 #include "config.h"
  41 #include "database.h"
  42 #include "msgbase.h"
  43 #include "control.h"
  44 #include "ft_wordbreaker.h"
  45 #include "crc16.h"
  46 #include "ctdl_module.h"
  47
  48 /*
  49  * Noise words are not included in search indices.
  50  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  51  * must also be changed, so that the index is rebuilt.
  52  */
  53
  54 noise_word *noise_words[26];
  55
  56 static char *noise_words_init[] = {
  57         "about",
  58         "after",
  59         "also",
  60         "another",
  61         "because",
  62         "been",
  63         "before",
  64         "being",
  65         "between",
  66         "both",
  67         "came",
  68         "come",
  69         "could",
  70         "each",
  71         "from",
  72         "have",
  73         "here",
  74         "himself",
  75         "into",
  76         "like",
  77         "make",
  78         "many",
  79         "might",
  80         "more",
  81         "most",
  82         "much",
  83         "must",
  84         "never",
  85         "only",
  86         "other",
  87         "over",
  88         "said",
  89         "same",
  90         "should",
  91         "since",
  92         "some",
  93         "still",
  94         "such",
  95         "take",
  96         "than",
  97         "that",
  98         "their",
  99         "them",
 100         "then",
 101         "there",
 102         "these",
 103         "they",
 104         "this",
 105         "those",
 106         "through",
 107         "under",
 108         "very",
 109         "well",
 110         "were",
 111         "what",
 112         "where",
 113         "which",
 114         "while",
 115         "with",
 116         "would",
 117         "your"
 118 };
 119
 120
 121 void initialize_noise_words(void)
 122 {
 123         int i;
 124         int len;
 125         int ch;
 126         noise_word *next;
 127
 128         memset (noise_words, 0, sizeof(noise_words));
 129
 130         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
 131         {
 132                 ch = noise_words_init[i][0] - 'a';
 133                 len = strlen(noise_words_init[i]);
 134
 135                 next = malloc(sizeof(noise_word));
 136                 next->len = len;
 137                 next->word = strdup(noise_words_init[i]);
 138                 next->next = noise_words[ch];
 139                 noise_words[ch] = next;
 140         }
 141 }
 142
 143
 144 void noise_word_cleanup(void)
 145 {
 146         int i;
 147         noise_word *cur, *next;
 148
 149         CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
 150
 151         for (i = 0 ; i < 26 ; i++)
 152         {
 153                 cur = noise_words[i];
 154                 while (cur)
 155                 {
 156                         next = cur->next;
 157                         free(cur->word);
 158                         free(cur);
 159                         cur = next;
 160                 }
 161         }
 162 }
 163
 164 /*
 165  * Compare function
 166  */
 167 int intcmp(const void *rec1, const void *rec2) {
 168         int i1, i2;
 169
 170         i1 = *(const int *)rec1;
 171         i2 = *(const int *)rec2;
 172
 173         if (i1 > i2) return(1);
 174         if (i1 < i2) return(-1);
 175         return(0);
 176 }
 177
 178
 179 void wordbreaker(char *text, int *num_tokens, int **tokens) {
 180
 181         int wb_num_tokens = 0;
 182         int wb_num_alloc = 0;
 183         int *wb_tokens = NULL;
 184
 185         char *ptr;
 186         char *word_start;
 187         char *word_end;
 188         char ch;
 189         int word_len;
 190         char word[256];
 191         int i;
 192         int word_crc;
 193         noise_word *noise;
 194
 195
 196         if (text == NULL) {             /* no NULL text please */
 197                 *num_tokens = 0;
 198                 *tokens = NULL;
 199                 return;
 200         }
 201
 202         if (text[0] == 0) {             /* no empty text either */
 203                 *num_tokens = 0;
 204                 *tokens = NULL;
 205                 return;
 206         }
 207
 208         ptr = text;
 209         word_start = NULL;
 210         while (*ptr) {
 211                 ch = *ptr;
 212                 if (isalnum(ch)) {
 213                         if (!word_start) {
 214                                 word_start = ptr;
 215                         }
 216                 }
 217                 ++ptr;
 218                 ch = *ptr;
 219                 if ( (!isalnum(ch)) && (word_start) ) {
 220                         word_end = ptr;
 221 //                      --word_end;
 222
 223                         /* extract the word */
 224                         word_len = word_end - word_start;
 225                         if (word_len >= sizeof word) {
 226                                 CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
 227                                 safestrncpy(word, word_start, sizeof word);
 228                                 word[(sizeof word) - 1] = 0;
 229                         }
 230                         else {
 231                                 safestrncpy(word, word_start, word_len+1);
 232                                 word[word_len] = 0;
 233                         }
 234                         word_start = NULL;
 235
 236                         /* are we ok with the length? */
 237                         if ( (word_len >= WB_MIN)
 238                            && (word_len <= WB_MAX) ) {
 239                                 for (i=0; i<word_len; ++i) {
 240                                         word[i] = tolower(word[i]);
 241                                 }
 242                                 /* disqualify noise words */
 243                                 noise = noise_words[(int) (word[0]-'a')];
 244                                 while (noise)
 245                                 {
 246                                         if (noise->len == word_len)
 247                                         {
 248                                                 if (!strcmp(word, noise->word))
 249                                                 {
 250                                                         word_len = 0;
 251                                                         break;
 252                                                 }
 253                                         }
 254                                         noise = noise->next;
 255                                 }
 256                                 if (word_len == 0)
 257                                         continue;
 258
 259                                 word_crc = (int)
 260                                         CalcCRC16Bytes(word_len, word);
 261
 262                                 ++wb_num_tokens;
 263                                 if (wb_num_tokens > wb_num_alloc) {
 264                                         wb_num_alloc += 512;
 265                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 266                                 }
 267                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 268                         }
 269                 }
 270         }
 271
 272         /* sort and purge dups */
 273         if (wb_num_tokens > 1) {
 274                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 275                 for (i=0; i<(wb_num_tokens-1); ++i) {
 276                         if (wb_tokens[i] == wb_tokens[i+1]) {
 277                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 278                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 279                                 --wb_num_tokens;
 280                                 --i;
 281                         }
 282                 }
 283         }
 284
 285         *num_tokens = wb_num_tokens;
 286         *tokens = wb_tokens;
 287 }
 288