citadel/ft_wordbreaker.c

   1 /*
   2  * $Id$
   3  *
   4  * Default wordbreaker module for full text indexing.
   5  *
   6  */
   7
   8
   9 #include "sysdep.h"
  10 #include <stdlib.h>
  11 #include <unistd.h>
  12 #include <stdio.h>
  13 #include <fcntl.h>
  14 #include <signal.h>
  15 #include <pwd.h>
  16 #include <errno.h>
  17 #include <sys/types.h>
  18
  19 #if TIME_WITH_SYS_TIME
  20 # include <sys/time.h>
  21 # include <time.h>
  22 #else
  23 # if HAVE_SYS_TIME_H
  24 #  include <sys/time.h>
  25 # else
  26 #  include <time.h>
  27 # endif
  28 #endif
  29
  30 #include <sys/wait.h>
  31 #include <ctype.h>
  32 #include <string.h>
  33 #include <limits.h>
  34 #include "citadel.h"
  35 #include "server.h"
  36 #include "sysdep_decls.h"
  37 #include "citserver.h"
  38 #include "support.h"
  39 #include "config.h"
  40 #include "serv_extensions.h"
  41 #include "database.h"
  42 #include "msgbase.h"
  43 #include "control.h"
  44 #include "tools.h"
  45 #include "ft_wordbreaker.h"
  46 #include "crc16.h"
  47
  48 /*
  49  * Noise words are not included in search indices.
  50  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  51  * must also be changed, so that the index is rebuilt.
  52  */
  53 static char *noise_words[] = {
  54         "about",
  55         "after",
  56         "all",
  57         "also",
  58         "an",
  59         "and",
  60         "another",
  61         "any",
  62         "are",
  63         "as",
  64         "at",
  65         "be",
  66         "because",
  67         "been",
  68         "before",
  69         "being",
  70         "between",
  71         "both",
  72         "but",
  73         "by",
  74         "came",
  75         "can",
  76         "come",
  77         "could",
  78         "did",
  79         "do",
  80         "each",
  81         "for",
  82         "from",
  83         "get",
  84         "got",
  85         "had",
  86         "has",
  87         "have",
  88         "he",
  89         "her",
  90         "here",
  91         "him",
  92         "himself",
  93         "his",
  94         "how",
  95         "if",
  96         "in",
  97         "into",
  98         "is",
  99         "it",
 100         "like",
 101         "make",
 102         "many",
 103         "me",
 104         "might",
 105         "more",
 106         "most",
 107         "much",
 108         "must",
 109         "my",
 110         "never",
 111         "now",
 112         "of",
 113         "on",
 114         "only",
 115         "or",
 116         "other",
 117         "our",
 118         "out",
 119         "over",
 120         "said",
 121         "same",
 122         "see",
 123         "should",
 124         "since",
 125         "some",
 126         "still",
 127         "such",
 128         "take",
 129         "than",
 130         "that",
 131         "the",
 132         "their",
 133         "them",
 134         "then",
 135         "there",
 136         "these",
 137         "they",
 138         "this",
 139         "those",
 140         "through",
 141         "to",
 142         "too",
 143         "under",
 144         "up",
 145         "very",
 146         "was",
 147         "way",
 148         "we",
 149         "well",
 150         "were",
 151         "what",
 152         "where",
 153         "which",
 154         "while",
 155         "with",
 156         "would",
 157         "you",
 158         "your"
 159 };
 160
 161 /*
 162  * Compare function
 163  */
 164 int intcmp(const void *rec1, const void *rec2) {
 165         int i1, i2;
 166
 167         i1 = *(const int *)rec1;
 168         i2 = *(const int *)rec2;
 169
 170         if (i1 > i2) return(1);
 171         if (i1 < i2) return(-1);
 172         return(0);
 173 }
 174
 175
 176 void wordbreaker(char *text, int *num_tokens, int **tokens) {
 177
 178         int wb_num_tokens = 0;
 179         int wb_num_alloc = 0;
 180         int *wb_tokens = NULL;
 181
 182         char *ptr;
 183         char *word_start;
 184         char *word_end;
 185         char ch;
 186         int word_len;
 187         char word[256];
 188         int i;
 189         int word_crc;
 190
 191         if (text == NULL) {             /* no NULL text please */
 192                 *num_tokens = 0;
 193                 *tokens = NULL;
 194                 return;
 195         }
 196
 197         if (text[0] == 0) {             /* no empty text either */
 198                 *num_tokens = 0;
 199                 *tokens = NULL;
 200                 return;
 201         }
 202
 203         ptr = text;
 204         word_start = NULL;
 205         while (*ptr) {
 206                 ch = *ptr;
 207                 if (isalnum(ch)) {
 208                         if (!word_start) {
 209                                 word_start = ptr;
 210                         }
 211                 }
 212                 ++ptr;
 213                 ch = *ptr;
 214                 if ( (!isalnum(ch)) && (word_start) ) {
 215                         word_end = ptr;
 216                         --word_end;
 217
 218                         /* extract the word */
 219                         word_len = word_end - word_start + 1;
 220                         safestrncpy(word, word_start, sizeof word);
 221                         word[word_len] = 0;
 222                         word_start = NULL;
 223
 224                         /* disqualify noise words */
 225                         for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
 226                                 if (!strcasecmp(word, noise_words[i])) {
 227                                         word_len = 0;
 228                                         break;
 229                                 }
 230                         }
 231
 232                         /* are we ok with the length? */
 233                         if ( (word_len >= WB_MIN)
 234                            && (word_len <= WB_MAX) ) {
 235                                 for (i=0; i<word_len; ++i) {
 236                                         word[i] = tolower(word[i]);
 237                                 }
 238                                 word_crc = (int)
 239                                         CalcCRC16Bytes(word_len, word);
 240
 241                                 ++wb_num_tokens;
 242                                 if (wb_num_tokens > wb_num_alloc) {
 243                                         wb_num_alloc += 512;
 244                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 245                                 }
 246                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 247                         }
 248                 }
 249         }
 250
 251         /* sort and purge dups */
 252         if (wb_num_tokens > 1) {
 253                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 254                 for (i=0; i<(wb_num_tokens-1); ++i) {
 255                         if (wb_tokens[i] == wb_tokens[i+1]) {
 256                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 257                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 258                                 --wb_num_tokens;
 259                                 --i;
 260                         }
 261                 }
 262         }
 263
 264         *num_tokens = wb_num_tokens;
 265         *tokens = wb_tokens;
 266 }
 267