citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * Default wordbreaker module for full text indexing.
   3  *
   4  * Copyright (c) 2005-2017 by the citadel.org team
   5  *
   6  * This program is open source software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 3.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  */
  14
  15 #include "sysdep.h"
  16 #include <stdlib.h>
  17 #include <unistd.h>
  18 #include <stdio.h>
  19 #include <fcntl.h>
  20 #include <signal.h>
  21 #include <pwd.h>
  22 #include <errno.h>
  23 #include <sys/types.h>
  24
  25 #if TIME_WITH_SYS_TIME
  26 # include <sys/time.h>
  27 # include <time.h>
  28 #else
  29 # if HAVE_SYS_TIME_H
  30 #  include <sys/time.h>
  31 # else
  32 #  include <time.h>
  33 # endif
  34 #endif
  35
  36 #include <sys/wait.h>
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <limits.h>
  40 #include <libcitadel.h>
  41 #include "citadel.h"
  42 #include "server.h"
  43 #include "sysdep_decls.h"
  44 #include "citserver.h"
  45 #include "support.h"
  46 #include "config.h"
  47 #include "database.h"
  48 #include "msgbase.h"
  49 #include "control.h"
  50 #include "ft_wordbreaker.h"
  51 #include "crc16.h"
  52 #include "ctdl_module.h"
  53
  54 /*
  55  * Noise words are not included in search indices.
  56  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  57  * must also be changed, so that the index is rebuilt.
  58  */
  59
/*
 * Noise word lookup table: one singly linked list per initial letter,
 * indexed by (first letter - 'a').  Filled in by initialize_noise_words()
 * and released by noise_word_cleanup().
 */
noise_word *noise_words[26];
  61
  62 static char *noise_words_init[] = {
  63         "about",
  64         "after",
  65         "also",
  66         "another",
  67         "because",
  68         "been",
  69         "before",
  70         "being",
  71         "between",
  72         "both",
  73         "came",
  74         "come",
  75         "could",
  76         "each",
  77         "from",
  78         "have",
  79         "here",
  80         "himself",
  81         "into",
  82         "like",
  83         "make",
  84         "many",
  85         "might",
  86         "more",
  87         "most",
  88         "much",
  89         "must",
  90         "never",
  91         "only",
  92         "other",
  93         "over",
  94         "said",
  95         "same",
  96         "should",
  97         "since",
  98         "some",
  99         "still",
 100         "such",
 101         "take",
 102         "than",
 103         "that",
 104         "their",
 105         "them",
 106         "then",
 107         "there",
 108         "these",
 109         "they",
 110         "this",
 111         "those",
 112         "through",
 113         "under",
 114         "very",
 115         "well",
 116         "were",
 117         "what",
 118         "where",
 119         "which",
 120         "while",
 121         "with",
 122         "would",
 123         "your"
 124 };
 125
 126
 127 void initialize_noise_words(void)
 128 {
 129         int i;
 130         int len;
 131         int ch;
 132         noise_word *next;
 133
 134         memset (noise_words, 0, sizeof(noise_words));
 135
 136         for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
 137         {
 138                 ch = noise_words_init[i][0] - 'a';
 139                 len = strlen(noise_words_init[i]);
 140
 141                 next = malloc(sizeof(noise_word));
 142                 next->len = len;
 143                 next->word = strdup(noise_words_init[i]);
 144                 next->next = noise_words[ch];
 145                 noise_words[ch] = next;
 146         }
 147 }
 148
 149
 150 void noise_word_cleanup(void)
 151 {
 152         int i;
 153         noise_word *cur, *next;
 154
 155         syslog(LOG_INFO, "wordbreaker: cleaning up fulltext noise words");
 156
 157         for (i = 0 ; i < 26 ; i++)
 158         {
 159                 cur = noise_words[i];
 160                 while (cur)
 161                 {
 162                         next = cur->next;
 163                         free(cur->word);
 164                         free(cur);
 165                         cur = next;
 166                 }
 167         }
 168 }
 169
 170 /*
 171  * Compare function
 172  */
 173 int intcmp(const void *rec1, const void *rec2) {
 174         int i1, i2;
 175
 176         i1 = *(const int *)rec1;
 177         i2 = *(const int *)rec2;
 178
 179         if (i1 > i2) return(1);
 180         if (i1 < i2) return(-1);
 181         return(0);
 182 }
 183
 184
 185 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
 186
 187         int wb_num_tokens = 0;
 188         int wb_num_alloc = 0;
 189         int *wb_tokens = NULL;
 190
 191         const char *ptr;
 192         const char *word_start;
 193         const char *word_end;
 194         char ch;
 195         int word_len;
 196         char word[256];
 197         int i;
 198         int word_crc;
 199         noise_word *noise;
 200
 201
 202         if (text == NULL) {             /* no NULL text please */
 203                 *num_tokens = 0;
 204                 *tokens = NULL;
 205                 return;
 206         }
 207
 208         if (text[0] == 0) {             /* no empty text either */
 209                 *num_tokens = 0;
 210                 *tokens = NULL;
 211                 return;
 212         }
 213
 214         ptr = text;
 215         word_start = NULL;
 216         while (*ptr) {
 217                 ch = *ptr;
 218                 if (isalnum(ch)) {
 219                         if (!word_start) {
 220                                 word_start = ptr;
 221                         }
 222                 }
 223                 ++ptr;
 224                 ch = *ptr;
 225                 if ( (!isalnum(ch)) && (word_start) ) {
 226                         word_end = ptr;
 227
 228                         /* extract the word */
 229                         word_len = word_end - word_start;
 230                         if (word_len >= sizeof word) {
 231                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
 232                                 safestrncpy(word, word_start, sizeof word);
 233                                 word[(sizeof word) - 1] = 0;
 234                         }
 235                         else {
 236                                 safestrncpy(word, word_start, word_len+1);
 237                                 word[word_len] = 0;
 238                         }
 239                         word_start = NULL;
 240
 241                         /* are we ok with the length? */
 242                         if ( (word_len >= WB_MIN)
 243                            && (word_len <= WB_MAX) ) {
 244                                 for (i=0; i<word_len; ++i) {
 245                                         word[i] = tolower(word[i]);
 246                                 }
 247                                 /* disqualify noise words */
 248                                 noise = noise_words[(int) (word[0]-'a')];
 249                                 while (noise)
 250                                 {
 251                                         if (noise->len == word_len)
 252                                         {
 253                                                 if (!strcmp(word, noise->word))
 254                                                 {
 255                                                         word_len = 0;
 256                                                         break;
 257                                                 }
 258                                         }
 259                                         noise = noise->next;
 260                                 }
 261                                 if (word_len == 0)
 262                                         continue;
 263
 264                                 word_crc = (int)
 265                                         CalcCRC16Bytes(word_len, word);
 266
 267                                 ++wb_num_tokens;
 268                                 if (wb_num_tokens > wb_num_alloc) {
 269                                         wb_num_alloc += 512;
 270                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 271                                 }
 272                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 273                         }
 274                 }
 275         }
 276
 277         /* sort and purge dups */
 278         if (wb_num_tokens > 1) {
 279                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 280                 for (i=0; i<(wb_num_tokens-1); ++i) {
 281                         if (wb_tokens[i] == wb_tokens[i+1]) {
 282                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 283                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 284                                 --wb_num_tokens;
 285                                 --i;
 286                         }
 287                 }
 288         }
 289
 290         *num_tokens = wb_num_tokens;
 291         *tokens = wb_tokens;
 292 }
 293