stable now but there are GIANT PIECES MISSING

[citadel.git] / citadel / modules / fulltext / ft_wordbreaker.c
diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c

index 664a7de8dc99c4b3794b8e051155d72209b23d2a..b236de362643d94d3b61c605c7586c63bb4a8306 100644 (file)
--- a/citadel/modules/fulltext/ft_wordbreaker.c
+++ b/citadel/modules/fulltext/ft_wordbreaker.c
@@ -1,11 +1,17 @@
  /*
- * $Id$
- *
   * Default wordbreaker module for full text indexing.
   *
+ * Copyright (c) 2005-2017 by the citadel.org team
+ *
+ * This program is open source software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
   */
  
-
  #include "sysdep.h"
  #include <stdlib.h>
  #include <unistd.h>
@@ -15,18 +21,7 @@
  #include <pwd.h>
  #include <errno.h>
  #include <sys/types.h>
-
-#if TIME_WITH_SYS_TIME
-# include <sys/time.h>
-# include <time.h>
-#else
-# if HAVE_SYS_TIME_H
-#  include <sys/time.h>
-# else
-#  include <time.h>
-# endif
-#endif
-
+#include <time.h>
  #include <sys/wait.h>
  #include <ctype.h>
  #include <string.h>
@@ -50,10 +45,7 @@
   * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
   * must also be changed, so that the index is rebuilt.
   */
-
-noise_word *noise_words[26];
-
-static char *noise_words_init[] = {
+static char *noise_words[] = {
         "about",
         "after",
         "also",
@@ -116,51 +108,9 @@ static char *noise_words_init[] = {
         "would",
         "your"
  };
+#define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
  
  
-void initialize_noise_words(void)
-{
-       int i;
-       int len;
-       int ch;
-       noise_word *next;
-       
-       memset (noise_words, 0, sizeof(noise_words));
-       
-       for (i=0; i<(sizeof(noise_words_init)/sizeof(char *)); ++i)
-       {
-               ch = noise_words_init[i][0] - 'a';
-               len = strlen(noise_words_init[i]);
-               
-               next = malloc(sizeof(noise_word));
-               next->len = len;
-               next->word = strdup(noise_words_init[i]);
-               next->next = noise_words[ch];
-               noise_words[ch] = next;
-       }
-}
-
-
-void noise_word_cleanup(void)
-{
-       int i;
-       noise_word *cur, *next;
-       
-       CtdlLogPrintf(CTDL_INFO, "Cleaning up fulltext noise words.\n");
-       
-       for (i = 0 ; i < 26 ; i++)
-       {
-               cur = noise_words[i];
-               while (cur)
-               {
-                       next = cur->next;
-                       free(cur->word);
-                       free(cur);
-                       cur = next;
-               }
-       }
-}
-
  /*
   * Compare function
   */
@@ -176,22 +126,20 @@ int intcmp(const void *rec1, const void *rec2) {
  }
  
  
-void wordbreaker(char *text, int *num_tokens, int **tokens) {
+void wordbreaker(const char *text, int *num_tokens, int **tokens) {
  
         int wb_num_tokens = 0;
         int wb_num_alloc = 0;
         int *wb_tokens = NULL;
  
-       char *ptr;
-       char *word_start;
-       char *word_end;
+       const char *ptr;
+       const char *word_start;
+       const char *word_end;
         char ch;
         int word_len;
         char word[256];
         int i;
         int word_crc;
-       noise_word *noise;
-       
         
         if (text == NULL) {             /* no NULL text please */
                 *num_tokens = 0;
@@ -218,12 +166,11 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                 ch = *ptr;
                 if ( (!isalnum(ch)) && (word_start) ) {
                         word_end = ptr;
-//                     --word_end;
  
                         /* extract the word */
                         word_len = word_end - word_start;
                         if (word_len >= sizeof word) {
-                               CtdlLogPrintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+                               syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
                                 safestrncpy(word, word_start, sizeof word);
                                 word[(sizeof word) - 1] = 0;
                         }
@@ -234,30 +181,22 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                         word_start = NULL;
  
                         /* are we ok with the length? */
-                       if ( (word_len >= WB_MIN)
-                          && (word_len <= WB_MAX) ) {
+                       if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
                                 for (i=0; i<word_len; ++i) {
                                         word[i] = tolower(word[i]);
                                 }
                                 /* disqualify noise words */
-                               noise = noise_words[(int) (word[0]-'a')];
-                               while (noise)
-                               {
-                                       if (noise->len == word_len)
-                                       {
-                                               if (!strcmp(word, noise->word)) 
-                                               {
-                                                       word_len = 0;
-                                                       break;
-                                               }
+                               for (i=0; i<NUM_NOISE; ++i) {
+                                       if (!strcmp(word, noise_words[i])) {
+                                               word_len = 0;
+                                               break;
                                         }
-                                       noise = noise->next;
                                 }
+
                                 if (word_len == 0)
                                         continue;
  
-                               word_crc = (int)
-                                       CalcCRC16Bytes(word_len, word);
+                               word_crc = (int) CalcCRC16Bytes(word_len, word);
  
                                 ++wb_num_tokens;
                                 if (wb_num_tokens > wb_num_alloc) {