citadel/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * Default wordbreaker module for full text indexing.
   3  *
   4  * Copyright (c) 2005-2017 by the citadel.org team
   5  *
   6  * This program is open source software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 3.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  */
  14
  15 #include "sysdep.h"
  16 #include <stdlib.h>
  17 #include <unistd.h>
  18 #include <stdio.h>
  19 #include <fcntl.h>
  20 #include <signal.h>
  21 #include <pwd.h>
  22 #include <errno.h>
  23 #include <sys/types.h>
  24
  25 #if TIME_WITH_SYS_TIME
  26 # include <sys/time.h>
  27 # include <time.h>
  28 #else
  29 # if HAVE_SYS_TIME_H
  30 #  include <sys/time.h>
  31 # else
  32 #  include <time.h>
  33 # endif
  34 #endif
  35
  36 #include <sys/wait.h>
  37 #include <ctype.h>
  38 #include <string.h>
  39 #include <limits.h>
  40 #include <libcitadel.h>
  41 #include "citadel.h"
  42 #include "server.h"
  43 #include "sysdep_decls.h"
  44 #include "citserver.h"
  45 #include "support.h"
  46 #include "config.h"
  47 #include "database.h"
  48 #include "msgbase.h"
  49 #include "control.h"
  50 #include "ft_wordbreaker.h"
  51 #include "crc16.h"
  52 #include "ctdl_module.h"
  53
  54 /*
  55  * Noise words are not included in search indices.
  56  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  57  * must also be changed, so that the index is rebuilt.
  58  */
  59 static char *noise_words[] = {
  60         "about",
  61         "after",
  62         "also",
  63         "another",
  64         "because",
  65         "been",
  66         "before",
  67         "being",
  68         "between",
  69         "both",
  70         "came",
  71         "come",
  72         "could",
  73         "each",
  74         "from",
  75         "have",
  76         "here",
  77         "himself",
  78         "into",
  79         "like",
  80         "make",
  81         "many",
  82         "might",
  83         "more",
  84         "most",
  85         "much",
  86         "must",
  87         "never",
  88         "only",
  89         "other",
  90         "over",
  91         "said",
  92         "same",
  93         "should",
  94         "since",
  95         "some",
  96         "still",
  97         "such",
  98         "take",
  99         "than",
 100         "that",
 101         "their",
 102         "them",
 103         "then",
 104         "there",
 105         "these",
 106         "they",
 107         "this",
 108         "those",
 109         "through",
 110         "under",
 111         "very",
 112         "well",
 113         "were",
 114         "what",
 115         "where",
 116         "which",
 117         "while",
 118         "with",
 119         "would",
 120         "your"
 121 };
 122 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
 123
 124
 125 /*
 126  * Compare function
 127  */
 128 int intcmp(const void *rec1, const void *rec2) {
 129         int i1, i2;
 130
 131         i1 = *(const int *)rec1;
 132         i2 = *(const int *)rec2;
 133
 134         if (i1 > i2) return(1);
 135         if (i1 < i2) return(-1);
 136         return(0);
 137 }
 138
 139
 140 void wordbreaker(const char *text, int *num_tokens, int **tokens) {
 141
 142         int wb_num_tokens = 0;
 143         int wb_num_alloc = 0;
 144         int *wb_tokens = NULL;
 145
 146         const char *ptr;
 147         const char *word_start;
 148         const char *word_end;
 149         char ch;
 150         int word_len;
 151         char word[256];
 152         int i;
 153         int word_crc;
 154
 155         if (text == NULL) {             /* no NULL text please */
 156                 *num_tokens = 0;
 157                 *tokens = NULL;
 158                 return;
 159         }
 160
 161         if (text[0] == 0) {             /* no empty text either */
 162                 *num_tokens = 0;
 163                 *tokens = NULL;
 164                 return;
 165         }
 166
 167         ptr = text;
 168         word_start = NULL;
 169         while (*ptr) {
 170                 ch = *ptr;
 171                 if (isalnum(ch)) {
 172                         if (!word_start) {
 173                                 word_start = ptr;
 174                         }
 175                 }
 176                 ++ptr;
 177                 ch = *ptr;
 178                 if ( (!isalnum(ch)) && (word_start) ) {
 179                         word_end = ptr;
 180
 181                         /* extract the word */
 182                         word_len = word_end - word_start;
 183                         if (word_len >= sizeof word) {
 184                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
 185                                 safestrncpy(word, word_start, sizeof word);
 186                                 word[(sizeof word) - 1] = 0;
 187                         }
 188                         else {
 189                                 safestrncpy(word, word_start, word_len+1);
 190                                 word[word_len] = 0;
 191                         }
 192                         word_start = NULL;
 193
 194                         /* are we ok with the length? */
 195                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
 196                                 for (i=0; i<word_len; ++i) {
 197                                         word[i] = tolower(word[i]);
 198                                 }
 199                                 /* disqualify noise words */
 200                                 for (i=0; i<NUM_NOISE; ++i) {
 201                                         if (!strcmp(word, noise_words[i])) {
 202                                                 word_len = 0;
 203                                                 break;
 204                                         }
 205                                 }
 206
 207                                 if (word_len == 0)
 208                                         continue;
 209
 210                                 word_crc = (int) CalcCRC16Bytes(word_len, word);
 211
 212                                 ++wb_num_tokens;
 213                                 if (wb_num_tokens > wb_num_alloc) {
 214                                         wb_num_alloc += 512;
 215                                         wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
 216                                 }
 217                                 wb_tokens[wb_num_tokens - 1] = word_crc;
 218                         }
 219                 }
 220         }
 221
 222         /* sort and purge dups */
 223         if (wb_num_tokens > 1) {
 224                 qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
 225                 for (i=0; i<(wb_num_tokens-1); ++i) {
 226                         if (wb_tokens[i] == wb_tokens[i+1]) {
 227                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
 228                                         ((wb_num_tokens - i - 1)*sizeof(int)));
 229                                 --wb_num_tokens;
 230                                 --i;
 231                         }
 232                 }
 233         }
 234
 235         *num_tokens = wb_num_tokens;
 236         *tokens = wb_tokens;
 237 }
 238