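Misc style cleanup; wordbreaker() now returns an Array of tokens instead of filling num_tokens/tokens output parameters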

[citadel.git] / citadel / server / modules / fulltext / ft_wordbreaker.c
diff --git a/citadel/server/modules/fulltext/ft_wordbreaker.c b/citadel/server/modules/fulltext/ft_wordbreaker.c

index 81e605626291804b10c27f66be4c8e9f153aaafd..5f510e82027c3d9a1dd124a9182be4681449f5cc 100644 (file)
--- a/citadel/server/modules/fulltext/ft_wordbreaker.c
+++ b/citadel/server/modules/fulltext/ft_wordbreaker.c
@@ -1,16 +1,7 @@
-/*
- * Default wordbreaker module for full text indexing.
- *
- * Copyright (c) 2005-2017 by the citadel.org team
- *
- * This program is open source software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 3.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// Default wordbreaker module for full text indexing.
+//
+// Copyright (c) 2005-2024 by the citadel.org team
+// This program is open source software.  Use, duplication, or disclosure is subject to the GNU General Public License v3.
  
  #include "../../sysdep.h"
  #include <stdlib.h>
@@ -40,11 +31,8 @@
  #include "crc16.h"
  #include "../../ctdl_module.h"
  
-/*
- * Noise words are not included in search indices.
- * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
- * must also be changed, so that the index is rebuilt.
- */
+// Noise words are not included in search indices.
+// NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID must also be changed, so that the index is rebuilt.
  static char *noise_words[] = {
         "about",
         "after",
@@ -111,9 +99,7 @@ static char *noise_words[] = {
  #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
  
  
-/*
- * Compare function
- */
+// Compare function
  int intcmp(const void *rec1, const void *rec2) {
         int i1, i2;
  
@@ -126,12 +112,7 @@ int intcmp(const void *rec1, const void *rec2) {
  }
  
  
-void wordbreaker(const char *text, int *num_tokens, int **tokens) {
-
-       int wb_num_tokens = 0;
-       int wb_num_alloc = 0;
-       int *wb_tokens = NULL;
-
+Array *wordbreaker(const char *text) {
         const char *ptr;
         const char *word_start;
         const char *word_end;
@@ -140,17 +121,18 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
         char word[256];
         int i;
         int word_crc;
-       
-       if (text == NULL) {             /* no NULL text please */
-               *num_tokens = 0;
-               *tokens = NULL;
-               return;
+
+       if (text == NULL) {             // no NULL text please
+               return(NULL);
+       }
+
+       if (text[0] == 0) {             // no empty text either
+               return(NULL);
         }
  
-       if (text[0] == 0) {             /* no empty text either */
-               *num_tokens = 0;
-               *tokens = NULL;
-               return;
+       Array *found_tokens = array_new(sizeof(int));
+       if (found_tokens == NULL) {
+               return(NULL);
         }
  
         ptr = text;
@@ -167,7 +149,7 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
                 if ( (!isalnum(ch)) && (word_start) ) {
                         word_end = ptr;
  
-                       /* extract the word */
+                       // extract the word
                         word_len = word_end - word_start;
                         if (word_len >= sizeof word) {
                                 syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
@@ -180,48 +162,36 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
                         }
                         word_start = NULL;
  
-                       /* are we ok with the length? */
+                       // are we ok with the length?
                         if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
                                 for (i=0; i<word_len; ++i) {
                                         word[i] = tolower(word[i]);
                                 }
-                               /* disqualify noise words */
+                               // disqualify noise words
                                 for (i=0; i<NUM_NOISE; ++i) {
-                                       if (!strcmp(word, noise_words[i])) {
+                                       if (!strcasecmp(word, noise_words[i])) {
                                                 word_len = 0;
                                                 break;
                                         }
                                 }
-
-                               if (word_len == 0)
-                                       continue;
-
-                               word_crc = (int) CalcCRC16Bytes(word_len, word);
-
-                               ++wb_num_tokens;
-                               if (wb_num_tokens > wb_num_alloc) {
-                                       wb_num_alloc += 512;
-                                       wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+                               // add it to the array (FIXME make this case insensitive)
+                               if (word_len > 0) {
+                                       word_crc = (int) CalcCRC16Bytes(word_len, word);
+                                       array_append(found_tokens, &word_crc);
                                 }
-                               wb_tokens[wb_num_tokens - 1] = word_crc;
                         }
                 }
         }
  
-       /* sort and purge dups */
-       if (wb_num_tokens > 1) {
-               qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
-               for (i=0; i<(wb_num_tokens-1); ++i) {
-                       if (wb_tokens[i] == wb_tokens[i+1]) {
-                               memmove(&wb_tokens[i], &wb_tokens[i+1],
-                                       ((wb_num_tokens - i - 1)*sizeof(int)));
-                               --wb_num_tokens;
+       // sort and purge dups
+       if (array_len(found_tokens) > 1) {
+               array_sort(found_tokens, intcmp);
+               for (i=0; i<(array_len(found_tokens)); ++i) {
+                       if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
+                               array_delete_element_at(found_tokens, i);
                                 --i;
                         }
                 }
         }
-
-       *num_tokens = wb_num_tokens;
-       *tokens = wb_tokens;
+       return(found_tokens);
  }
-