citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * Default wordbreaker module for full text indexing.
   3  *
   4  * Copyright (c) 2005-2012 by the citadel.org team
   5  *
   6  *  This program is open source software; you can redistribute it and/or modify
   7  *  it under the terms of the GNU General Public License version 3.
   8  *
   9  *
  10  *
  11  *  This program is distributed in the hope that it will be useful,
  12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14  *  GNU General Public License for more details.
  15  *
  16  *
  17  *
  18  *
  19  */
  20
  21
  22 #include "sysdep.h"
  23 #include <stdlib.h>
  24 #include <unistd.h>
  25 #include <stdio.h>
  26 #include <fcntl.h>
  27 #include <signal.h>
  28 #include <pwd.h>
  29 #include <errno.h>
  30 #include <sys/types.h>
  31
  32 #if TIME_WITH_SYS_TIME
  33 # include <sys/time.h>
  34 # include <time.h>
  35 #else
  36 # if HAVE_SYS_TIME_H
  37 #  include <sys/time.h>
  38 # else
  39 #  include <time.h>
  40 # endif
  41 #endif
  42
  43 #include <sys/wait.h>
  44 #include <ctype.h>
  45 #include <string.h>
  46 #include <limits.h>
  47 #include <libcitadel.h>
  48 #include "citadel.h"
  49 #include "server.h"
  50 #include "sysdep_decls.h"
  51 #include "citserver.h"
  52 #include "support.h"
  53 #include "config.h"
  54 #include "database.h"
  55 #include "msgbase.h"
  56 #include "control.h"
  57 #include "ft_wordbreaker.h"
  58 #include "crc16.h"
  59 #include "ctdl_module.h"
  60
  61 /*
  62  * Noise words are not included in search indices.
  63  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  64  * must also be changed, so that the index is rebuilt.
  65  */
  66
  67 noise_word *noise_words[26];
  68
  69 static char *noise_words_init[] = {
  70         "about",
  71         "after",
  72         "also",
  73         "another",
  74         "because",
  75         "been",
  76         "before",
  77         "being",
  78         "between",
  79         "both",
  80         "came",
  81         "come",
  82         "could",
  83         "each",
  84         "from",
  85         "have",
  86         "here",
  87         "himself",
  88         "into",
  89         "like",
  90         "make",
  91         "many",
  92         "might",
  93         "more",
  94         "most",
  95         "much",
  96         "must",
  97         "never",
  98         "only",
  99         "other",
 100         "over",
 101         "said",
 102         "same",
 103         "should",
 104         "since",
 105         "some",
 106         "still",
 107         "such",
 108         "take",
 109         "than",
 110         "that",
 111         "their",
 112         "them",
 113         "then",
 114         "there",
 115         "these",
 116         "they",
 117         "this",
 118         "those",
 119         "through",
 120         "under",
 121         "very",
 122         "well",
 123         "were",
 124         "what",
 125         "where",
 126         "which",
 127         "while",
 128         "with",
 129         "would",
 130         "your"
 131 };
 132
 133
 134 void initialize_noise_words(void)
 135 {
 136         int i;
 137         int len;
 138         int ch;
 139         noise_word *next;
 140
 141         memset (noise_words, 0, sizeof(noise_words));
 142
 143         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
 144         {
 145                 ch = noise_words_init[i][0] - 'a';
 146                 len = strlen(noise_words_init[i]);
 147
 148                 next = malloc(sizeof(noise_word));
 149                 next->len = len;
 150                 next->word = strdup(noise_words_init[i]);
 151                 next->next = noise_words[ch];
 152                 noise_words[ch] = next;
 153         }
 154 }
 155
 156
 157 void noise_word_cleanup(void)
 158 {
 159         int i;
 160         noise_word *cur, *next;
 161
 162         syslog(LOG_INFO, "Cleaning up fulltext noise words.\n");
 163
 164         for (i = 0 ; i < 26 ; i++)
 165         {
 166                 cur = noise_words[i];
 167                 while (cur)
 168                 {
 169                         next = cur->next;
 170                         free(cur->word);
 171                         free(cur);
 172                         cur = next;
 173                 }
 174         }
 175 }
 176
 177 /*
 178  * Compare function
 179  */
 180 int intcmp(const void *rec1, const void *rec2) {
 181         int i1, i2;
 182
 183         i1 = *(const int *)rec1;
 184         i2 = *(const int *)rec2;
 185
 186         if (i1 > i2) return(1);
 187         if (i1 < i2) return(-1);
 188         return(0);
 189 }
 190
 191
 192 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
 193
 194         int wb_num_tokens = 0;
 195         int wb_num_alloc = 0;
 196         int *wb_tokens = NULL;
 197
 198         const char *ptr;
 199         const char *word_start;
 200         const char *word_end;
 201         char ch;
 202         int word_len;
 203         char word[256];
 204         int i;
 205         int word_crc;
 206         noise_word *noise;
 207
 208
 209         if (text == NULL) {             /* no NULL text please */
 210                 *num_tokens = 0;
 211                 *tokens = NULL;
 212                 return;
 213         }
 214
 215         if (text[0] == 0) {             /* no empty text either */
 216                 *num_tokens = 0;
 217                 *tokens = NULL;
 218                 return;
 219         }
 220
 221         ptr = text;
 222         word_start = NULL;
 223         while (*ptr) {
 224                 ch = *ptr;
 225                 if (isalnum(ch)) {
 226                         if (!word_start) {
 227                                 word_start = ptr;
 228                         }
 229                 }
 230                 ++ptr;
 231                 ch = *ptr;
 232                 if ( (!isalnum(ch)) && (word_start) ) {
 233                         word_end = ptr;
 234 //                      --word_end;
 235
 236                         /* extract the word */
 237                         word_len = word_end - word_start;
 238                         if (word_len >= sizeof word) {
 239                                 syslog(LOG_DEBUG, "Invalid word length: %d\n", word_len);
 240                                 safestrncpy(word, word_start, sizeof word);
 241                                 word[(sizeof word) - 1] = 0;
 242                         }
 243                         else {
 244                                 safestrncpy(word, word_start, word_len+1);
 245                                 word[word_len] = 0;
 246                         }
 247                         word_start = NULL;
 248
 249                         /* are we ok with the length? */
 250                         if ( (word_len >= WB_MIN)
 251                            && (word_len <= WB_MAX) ) {
 252                                 for (i=0; i<word_len; ++i) {
 253                                         word[i] = tolower(word[i]);
 254                                 }
 255                                 /* disqualify noise words */
 256                                 noise = noise_words[(int) (word[0]-'a')];
 257                                 while (noise)
 258                                 {
 259                                         if (noise->len == word_len)
 260                                         {
 261                                                 if (!strcmp(word, noise->word))
 262                                                 {
 263                                                         word_len = 0;
 264                                                         break;
 265                                                 }
 266                                         }
 267                                         noise = noise->next;
 268                                 }
 269                                 if (word_len == 0)
 270                                         continue;
 271
 272                                 word_crc = (int)
 273                                         CalcCRC16Bytes(word_len, word);
 274
 275                                 ++wb_num_tokens;
 276                                 if (wb_num_tokens > wb_num_alloc) {
 277                                         wb_num_alloc += 512;
 278                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 279                                 }
 280                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 281                         }
 282                 }
 283         }
 284
 285         /* sort and purge dups */
 286         if (wb_num_tokens > 1) {
 287                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 288                 for (i=0; i<(wb_num_tokens-1); ++i) {
 289                         if (wb_tokens[i] == wb_tokens[i+1]) {
 290                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 291                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 292                                 --wb_num_tokens;
 293                                 --i;
 294                         }
 295                 }
 296         }
 297
 298         *num_tokens = wb_num_tokens;
 299         *tokens = wb_tokens;
 300 }
 301