citadel/server/modules/fulltext/ft_wordbreaker.c

   1 /*
   2  * Default wordbreaker module for full text indexing.
   3  *
   4  * Copyright (c) 2005-2017 by the citadel.org team
   5  *
   6  * This program is open source software; you can redistribute it and/or modify
   7  * it under the terms of the GNU General Public License version 3.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  */
  14
  15 #include "../../sysdep.h"
  16 #include <stdlib.h>
  17 #include <unistd.h>
  18 #include <stdio.h>
  19 #include <fcntl.h>
  20 #include <signal.h>
  21 #include <pwd.h>
  22 #include <errno.h>
  23 #include <sys/types.h>
  24 #include <time.h>
  25 #include <sys/wait.h>
  26 #include <ctype.h>
  27 #include <string.h>
  28 #include <limits.h>
  29 #include <libcitadel.h>
  30 #include "../../citadel_defs.h"
  31 #include "../../server.h"
  32 #include "../../sysdep_decls.h"
  33 #include "../../citserver.h"
  34 #include "../../support.h"
  35 #include "../../config.h"
  36 #include "../../database.h"
  37 #include "../../msgbase.h"
  38 #include "../../control.h"
  39 #include "ft_wordbreaker.h"
  40 #include "crc16.h"
  41 #include "../../ctdl_module.h"
  42
  43 /*
  44  * Noise words are not included in search indices.
  45  * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
  46  * must also be changed, so that the index is rebuilt.
  47  */
  48 static char *noise_words[] = {
  49         "about",
  50         "after",
  51         "also",
  52         "another",
  53         "because",
  54         "been",
  55         "before",
  56         "being",
  57         "between",
  58         "both",
  59         "came",
  60         "come",
  61         "could",
  62         "each",
  63         "from",
  64         "have",
  65         "here",
  66         "himself",
  67         "into",
  68         "like",
  69         "make",
  70         "many",
  71         "might",
  72         "more",
  73         "most",
  74         "much",
  75         "must",
  76         "never",
  77         "only",
  78         "other",
  79         "over",
  80         "said",
  81         "same",
  82         "should",
  83         "since",
  84         "some",
  85         "still",
  86         "such",
  87         "take",
  88         "than",
  89         "that",
  90         "their",
  91         "them",
  92         "then",
  93         "there",
  94         "these",
  95         "they",
  96         "this",
  97         "those",
  98         "through",
  99         "under",
 100         "very",
 101         "well",
 102         "were",
 103         "what",
 104         "where",
 105         "which",
 106         "while",
 107         "with",
 108         "would",
 109         "your"
 110 };
 111 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
 112
 113
 114 /*
 115  * Compare function
 116  */
 117 int intcmp(const void *rec1, const void *rec2) {
 118         int i1, i2;
 119
 120         i1 = *(const int *)rec1;
 121         i2 = *(const int *)rec2;
 122
 123         if (i1 > i2) return(1);
 124         if (i1 < i2) return(-1);
 125         return(0);
 126 }
 127
 128
 129 Array *wordbreaker(const char *text) {
 130         const char *ptr;
 131         const char *word_start;
 132         const char *word_end;
 133         char ch;
 134         int word_len;
 135         char word[256];
 136         int i;
 137         int word_crc;
 138
 139         if (text == NULL) {             /* no NULL text please */
 140                 return(NULL);
 141         }
 142
 143         if (text[0] == 0) {             /* no empty text either */
 144                 return(NULL);
 145         }
 146
 147         Array *found_tokens = array_new(sizeof(int));
 148         if (found_tokens == NULL) {
 149                 return(NULL);
 150         }
 151
 152         ptr = text;
 153         word_start = NULL;
 154         while (*ptr) {
 155                 ch = *ptr;
 156                 if (isalnum(ch)) {
 157                         if (!word_start) {
 158                                 word_start = ptr;
 159                         }
 160                 }
 161                 ++ptr;
 162                 ch = *ptr;
 163                 if ( (!isalnum(ch)) && (word_start) ) {
 164                         word_end = ptr;
 165
 166                         /* extract the word */
 167                         word_len = word_end - word_start;
 168                         if (word_len >= sizeof word) {
 169                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
 170                                 safestrncpy(word, word_start, sizeof word);
 171                                 word[(sizeof word) - 1] = 0;
 172                         }
 173                         else {
 174                                 safestrncpy(word, word_start, word_len+1);
 175                                 word[word_len] = 0;
 176                         }
 177                         word_start = NULL;
 178
 179                         /* are we ok with the length? */
 180                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
 181                                 for (i=0; i<word_len; ++i) {
 182                                         word[i] = tolower(word[i]);
 183                                 }
 184                                 /* disqualify noise words */
 185                                 for (i=0; i<NUM_NOISE; ++i) {
 186                                         if (!strcasecmp(word, noise_words[i])) {
 187                                                 word_len = 0;
 188                                                 break;
 189                                         }
 190                                 }
 191                                 /* FIXME make this case insensitive */
 192                                 /* add it to the array */
 193                                 if (word_len > 0) {
 194                                         word_crc = (int) CalcCRC16Bytes(word_len, word);
 195                                         array_append(found_tokens, &word_crc);
 196                                 }
 197                         }
 198                 }
 199         }
 200
 201         /* sort and purge dups */
 202         if (array_len(found_tokens) > 1) {
 203                 array_sort(found_tokens, intcmp);
 204                 for (i=0; i<(array_len(found_tokens)); ++i) {
 205                         if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
 206                                 array_delete_element_at(found_tokens, i);
 207                                 --i;
 208                         }
 209                 }
 210         }
 211         return(found_tokens);
 212 }
 213