code.citadel.org Git - citadel.git/blobdiff - citadel/server/modules/fulltext/ft_wordbreaker.c
misc style cleanup
[citadel.git] / citadel / server / modules / fulltext / ft_wordbreaker.c
index 81e605626291804b10c27f66be4c8e9f153aaafd..5f510e82027c3d9a1dd124a9182be4681449f5cc 100644 (file)
@@ -1,16 +1,7 @@
-/*
- * Default wordbreaker module for full text indexing.
- *
- * Copyright (c) 2005-2017 by the citadel.org team
- *
- * This program is open source software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 3.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
+// Default wordbreaker module for full text indexing.
+//
+// Copyright (c) 2005-2024 by the citadel.org team
+// This program is open source software.  Use, duplication, or disclosure is subject to the GNU General Public License v3.
 
 #include "../../sysdep.h"
 #include <stdlib.h>
 #include "crc16.h"
 #include "../../ctdl_module.h"
 
-/*
- * Noise words are not included in search indices.
- * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
- * must also be changed, so that the index is rebuilt.
- */
+// Noise words are not included in search indices.
+// NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID must also be changed, so that the index is rebuilt.
 static char *noise_words[] = {
        "about",
        "after",
@@ -111,9 +99,7 @@ static char *noise_words[] = {
 #define NUM_NOISE (sizeof(noise_words) / sizeof(char *))
 
 
-/*
- * Compare function
- */
+// Compare function
 int intcmp(const void *rec1, const void *rec2) {
        int i1, i2;
 
@@ -126,12 +112,7 @@ int intcmp(const void *rec1, const void *rec2) {
 }
 
 
-void wordbreaker(const char *text, int *num_tokens, int **tokens) {
-
-       int wb_num_tokens = 0;
-       int wb_num_alloc = 0;
-       int *wb_tokens = NULL;
-
+Array *wordbreaker(const char *text) {
        const char *ptr;
        const char *word_start;
        const char *word_end;
@@ -140,17 +121,18 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
        char word[256];
        int i;
        int word_crc;
-       
-       if (text == NULL) {             /* no NULL text please */
-               *num_tokens = 0;
-               *tokens = NULL;
-               return;
+
+       if (text == NULL) {             // no NULL text please
+               return(NULL);
+       }
+
+       if (text[0] == 0) {             // no empty text either
+               return(NULL);
        }
 
-       if (text[0] == 0) {             /* no empty text either */
-               *num_tokens = 0;
-               *tokens = NULL;
-               return;
+       Array *found_tokens = array_new(sizeof(int));
+       if (found_tokens == NULL) {
+               return(NULL);
        }
 
        ptr = text;
@@ -167,7 +149,7 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
                if ( (!isalnum(ch)) && (word_start) ) {
                        word_end = ptr;
 
-                       /* extract the word */
+                       // extract the word
                        word_len = word_end - word_start;
                        if (word_len >= sizeof word) {
                                syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
@@ -180,48 +162,36 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
                        }
                        word_start = NULL;
 
-                       /* are we ok with the length? */
+                       // are we ok with the length?
                        if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
                                for (i=0; i<word_len; ++i) {
                                        word[i] = tolower(word[i]);
                                }
-                               /* disqualify noise words */
+                               // disqualify noise words
                                for (i=0; i<NUM_NOISE; ++i) {
-                                       if (!strcmp(word, noise_words[i])) {
+                                       if (!strcasecmp(word, noise_words[i])) {
                                                word_len = 0;
                                                break;
                                        }
                                }
-
-                               if (word_len == 0)
-                                       continue;
-
-                               word_crc = (int) CalcCRC16Bytes(word_len, word);
-
-                               ++wb_num_tokens;
-                               if (wb_num_tokens > wb_num_alloc) {
-                                       wb_num_alloc += 512;
-                                       wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+                               // add it to the array (FIXME make this case insensitive)
+                               if (word_len > 0) {
+                                       word_crc = (int) CalcCRC16Bytes(word_len, word);
+                                       array_append(found_tokens, &word_crc);
                                }
-                               wb_tokens[wb_num_tokens - 1] = word_crc;
                        }
                }
        }
 
-       /* sort and purge dups */
-       if (wb_num_tokens > 1) {
-               qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
-               for (i=0; i<(wb_num_tokens-1); ++i) {
-                       if (wb_tokens[i] == wb_tokens[i+1]) {
-                               memmove(&wb_tokens[i], &wb_tokens[i+1],
-                                       ((wb_num_tokens - i - 1)*sizeof(int)));
-                               --wb_num_tokens;
+       // sort and purge dups
+       if (array_len(found_tokens) > 1) {
+               array_sort(found_tokens, intcmp);
+               for (i=0; i<(array_len(found_tokens)); ++i) {
+                       if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
+                               array_delete_element_at(found_tokens, i);
                                --i;
                        }
                }
        }
-
-       *num_tokens = wb_num_tokens;
-       *tokens = wb_tokens;
+       return(found_tokens);
 }
-