citadel/ft_wordbreaker.c

   1 /*
   2  * $Id$
   3  *
   4  * Default wordbreaker module for full text indexing.
   5  *
   6  */
   7
   8
   9 #include "sysdep.h"
  10 #include <stdlib.h>
  11 #include <unistd.h>
  12 #include <stdio.h>
  13 #include <fcntl.h>
  14 #include <signal.h>
  15 #include <pwd.h>
  16 #include <errno.h>
  17 #include <sys/types.h>
  18
  19 #if TIME_WITH_SYS_TIME
  20 # include <sys/time.h>
  21 # include <time.h>
  22 #else
  23 # if HAVE_SYS_TIME_H
  24 #  include <sys/time.h>
  25 # else
  26 #  include <time.h>
  27 # endif
  28 #endif
  29
  30 #include <sys/wait.h>
  31 #include <ctype.h>
  32 #include <string.h>
  33 #include <limits.h>
  34 #include "citadel.h"
  35 #include "server.h"
  36 #include "sysdep_decls.h"
  37 #include "citserver.h"
  38 #include "support.h"
  39 #include "config.h"
  40 #include "serv_extensions.h"
  41 #include "database.h"
  42 #include "msgbase.h"
  43 #include "control.h"
  44 #include "tools.h"
  45 #include "ft_wordbreaker.h"
  46 #include "crc16.h"
  47
  48 /*
  49  * Noise words are not included in search indices.
  50  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  51  * must also be changed, so that the index is rebuilt.
  52  */
  53 static char *noise_words[] = {
  54         "about",
  55         "after",
  56         "all",
  57         "also",
  58         "an",
  59         "and",
  60         "another",
  61         "any",
  62         "are",
  63         "as",
  64         "at",
  65         "be",
  66         "because",
  67         "been",
  68         "before",
  69         "being",
  70         "between",
  71         "both",
  72         "but",
  73         "by",
  74         "came",
  75         "can",
  76         "come",
  77         "could",
  78         "did",
  79         "do",
  80         "each",
  81         "for",
  82         "from",
  83         "get",
  84         "got",
  85         "had",
  86         "has",
  87         "have",
  88         "he",
  89         "her",
  90         "here",
  91         "him",
  92         "himself",
  93         "his",
  94         "how",
  95         "if",
  96         "in",
  97         "into",
  98         "is",
  99         "it",
 100         "like",
 101         "make",
 102         "many",
 103         "me",
 104         "might",
 105         "more",
 106         "most",
 107         "much",
 108         "must",
 109         "my",
 110         "never",
 111         "now",
 112         "of",
 113         "on",
 114         "only",
 115         "or",
 116         "other",
 117         "our",
 118         "out",
 119         "over",
 120         "said",
 121         "same",
 122         "see",
 123         "should",
 124         "since",
 125         "some",
 126         "still",
 127         "such",
 128         "take",
 129         "than",
 130         "that",
 131         "the",
 132         "their",
 133         "them",
 134         "then",
 135         "there",
 136         "these",
 137         "they",
 138         "this",
 139         "those",
 140         "through",
 141         "to",
 142         "too",
 143         "under",
 144         "up",
 145         "very",
 146         "was",
 147         "way",
 148         "we",
 149         "well",
 150         "were",
 151         "what",
 152         "where",
 153         "which",
 154         "while",
 155         "with",
 156         "would",
 157         "you",
 158         "your"
 159 };
 160
 161 /*
 162  * Compare function
 163  */
 164 int intcmp(const void *rec1, const void *rec2) {
 165         int i1, i2;
 166
 167         i1 = *(const int *)rec1;
 168         i2 = *(const int *)rec2;
 169
 170         if (i1 > i2) return(1);
 171         if (i1 < i2) return(-1);
 172         return(0);
 173 }
 174
 175
 176 void wordbreaker(char *text, int *num_tokens, int **tokens) {
 177
 178         int wb_num_tokens = 0;
 179         int wb_num_alloc = 0;
 180         int *wb_tokens = NULL;
 181
 182         char *ptr;
 183         char *word_start;
 184         char *word_end;
 185         char ch;
 186         int word_len;
 187         char word[256];
 188         int i;
 189         int word_crc;
 190
 191         if (text == NULL) {             /* no NULL text please */
 192                 *num_tokens = 0;
 193                 *tokens = NULL;
 194                 return;
 195         }
 196
 197         if (text[0] == 0) {             /* no empty text either */
 198                 *num_tokens = 0;
 199                 *tokens = NULL;
 200                 return;
 201         }
 202
 203         ptr = text;
 204         word_start = NULL;
 205         while (*ptr) {
 206                 ch = *ptr;
 207                 if (isalnum(ch)) {
 208                         if (!word_start) {
 209                                 word_start = ptr;
 210                         }
 211                 }
 212                 ++ptr;
 213                 ch = *ptr;
 214                 if ( (!isalnum(ch)) && (word_start) ) {
 215                         word_end = ptr;
 216                         --word_end;
 217
 218                         /* extract the word */
 219                         word_len = word_end - word_start + 1;
 220                         safestrncpy(word, word_start, sizeof word);
 221                         if (word_len >= sizeof word) {
 222                                 lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
 223                                 word[(sizeof word_len) - 1] = 0;
 224                         }
 225                         else {
 226                                 word[word_len] = 0;
 227                         }
 228                         word_start = NULL;
 229
 230                         /* disqualify noise words */
 231                         for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
 232                                 if (!strcasecmp(word, noise_words[i])) {
 233                                         word_len = 0;
 234                                         break;
 235                                 }
 236                         }
 237
 238                         /* are we ok with the length? */
 239                         if ( (word_len >= WB_MIN)
 240                            && (word_len <= WB_MAX) ) {
 241                                 for (i=0; i<word_len; ++i) {
 242                                         word[i] = tolower(word[i]);
 243                                 }
 244                                 word_crc = (int)
 245                                         CalcCRC16Bytes(word_len, word);
 246
 247                                 ++wb_num_tokens;
 248                                 if (wb_num_tokens > wb_num_alloc) {
 249                                         wb_num_alloc += 512;
 250                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 251                                 }
 252                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 253                         }
 254                 }
 255         }
 256
 257         /* sort and purge dups */
 258         if (wb_num_tokens > 1) {
 259                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 260                 for (i=0; i<(wb_num_tokens-1); ++i) {
 261                         if (wb_tokens[i] == wb_tokens[i+1]) {
 262                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 263                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 264                                 --wb_num_tokens;
 265                                 --i;
 266                         }
 267                 }
 268         }
 269
 270         *num_tokens = wb_num_tokens;
 271         *tokens = wb_tokens;
 272 }
 273