serv_fulltext: wordbreaker now returns a libcitadel Array

author Art Cancro <ajc@citadel.org>
Tue, 29 Aug 2023 21:12:04 +0000 (17:12 -0400)
committer Art Cancro <ajc@citadel.org>
Tue, 29 Aug 2023 21:12:04 +0000 (17:12 -0400)
diff --git a/citadel/server/modules/fulltext/ft_wordbreaker.c b/citadel/server/modules/fulltext/ft_wordbreaker.c

index 81e605626291804b10c27f66be4c8e9f153aaafd..d8a11e55cfd437ddfab4a104a13609d3862190bf 100644 (file)
--- a/citadel/server/modules/fulltext/ft_wordbreaker.c
+++ b/citadel/server/modules/fulltext/ft_wordbreaker.c
@@ -126,12 +126,7 @@ int intcmp(const void *rec1, const void *rec2) {
  }
  
  
-void wordbreaker(const char *text, int *num_tokens, int **tokens) {
-
-       int wb_num_tokens = 0;
-       int wb_num_alloc = 0;
-       int *wb_tokens = NULL;
-
+Array *wordbreaker(const char *text) {
         const char *ptr;
         const char *word_start;
         const char *word_end;
@@ -140,17 +135,18 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
         char word[256];
         int i;
         int word_crc;
-       
+
         if (text == NULL) {             /* no NULL text please */
-               *num_tokens = 0;
-               *tokens = NULL;
-               return;
+               return(NULL);
         }
  
         if (text[0] == 0) {             /* no empty text either */
-               *num_tokens = 0;
-               *tokens = NULL;
-               return;
+               return(NULL);
+       }
+
+       Array *found_tokens = array_new(sizeof(int));
+       if (found_tokens == NULL) {
+               return(NULL);
         }
  
         ptr = text;
@@ -187,41 +183,31 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) {
                                 }
                                 /* disqualify noise words */
                                 for (i=0; i<NUM_NOISE; ++i) {
-                                       if (!strcmp(word, noise_words[i])) {
+                                       if (!strcasecmp(word, noise_words[i])) {
                                                 word_len = 0;
                                                 break;
                                         }
                                 }
-
-                               if (word_len == 0)
-                                       continue;
-
-                               word_crc = (int) CalcCRC16Bytes(word_len, word);
-
-                               ++wb_num_tokens;
-                               if (wb_num_tokens > wb_num_alloc) {
-                                       wb_num_alloc += 512;
-                                       wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+                               /* FIXME make this case insensitive */
+                               /* add it to the array */
+                               if (word_len > 0) {
+                                       word_crc = (int) CalcCRC16Bytes(word_len, word);
+                                       array_append(found_tokens, &word_crc);
                                 }
-                               wb_tokens[wb_num_tokens - 1] = word_crc;
                         }
                 }
         }
  
         /* sort and purge dups */
-       if (wb_num_tokens > 1) {
-               qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
-               for (i=0; i<(wb_num_tokens-1); ++i) {
-                       if (wb_tokens[i] == wb_tokens[i+1]) {
-                               memmove(&wb_tokens[i], &wb_tokens[i+1],
-                                       ((wb_num_tokens - i - 1)*sizeof(int)));
-                               --wb_num_tokens;
+       if (array_len(found_tokens) > 1) {
+               array_sort(found_tokens, intcmp);
+               for (i=0; i<(array_len(found_tokens)); ++i) {
+                       if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) {
+                               array_delete_element_at(found_tokens, i);
                                 --i;
                         }
                 }
         }
-
-       *num_tokens = wb_num_tokens;
-       *tokens = wb_tokens;
+       return(found_tokens);
  }
  
diff --git a/citadel/server/modules/fulltext/ft_wordbreaker.h b/citadel/server/modules/fulltext/ft_wordbreaker.h

index e36d6951e2673660155b6aa634e7e58a6f0d6b41..46767215adf37796d4362629f7d6a47623dcb82c 100644 (file)
--- a/citadel/server/modules/fulltext/ft_wordbreaker.h
+++ b/citadel/server/modules/fulltext/ft_wordbreaker.h
@@ -23,4 +23,4 @@
  #define WB_MIN                 4       // nothing with 3 or less chars
  #define WB_MAX                 40
  
-void wordbreaker(const char *text, int *num_tokens, int **tokens);
+Array *wordbreaker(const char *text);
diff --git a/citadel/server/modules/fulltext/serv_fulltext.c b/citadel/server/modules/fulltext/serv_fulltext.c

index 4f04baad649f300a9fca3d1b4d5f331b4171f480..c71232e082b123a2c2849f8b5296631253fe19e1 100644 (file)
--- a/citadel/server/modules/fulltext/serv_fulltext.c
+++ b/citadel/server/modules/fulltext/serv_fulltext.c
@@ -80,9 +80,8 @@ void ft_flush_cache(void) {
  
  // Index or de-index a message.  (op == 1 to index, 0 to de-index)
  void ft_index_message(long msgnum, int op) {
-       int num_tokens = 0;
-       int *tokens = NULL;
         int i, j;
+       Array *t = NULL;
         struct cdbdata cdb_bucket;
         StrBuf *msgtext;
         char *txt;
@@ -114,17 +113,17 @@ void ft_index_message(long msgnum, int op) {
                 syslog(LOG_DEBUG, "fulltext: wordbreaking message %ld (%d bytes)", msgnum, StrLength(msgtext));
         }
         txt = SmashStrBuf(&msgtext);
-       wordbreaker(txt, &num_tokens, &tokens);
+       t = wordbreaker(txt);
         free(txt);
  
-       syslog(LOG_DEBUG, "fulltext: %sindexing message %ld [%d tokens]", (op ? "" : "de"), msgnum, num_tokens);
-       if (num_tokens > 0) {
-               for (i=0; i<num_tokens; ++i) {
+       syslog(LOG_DEBUG, "fulltext: %sindexing message %ld [%d tokens]", (op ? "" : "de"), msgnum, array_len(t));
+       if (array_len(t) > 0) {
+               for (i=0; i<array_len(t); ++i) {
  
                         // Add the message to the relevant token bucket
  
                         // search for tokens[i]
-                       tok = tokens[i];
+                       memcpy(&tok, array_get_element_at(t, i), sizeof(int));
  
                         if ( (tok >= 0) && (tok <= 65535) ) {
                                 // fetch the bucket, Liza
@@ -164,7 +163,7 @@ void ft_index_message(long msgnum, int op) {
                         }
                 }
  
-               free(tokens);
+               array_free(t);
         }
  }
  
@@ -291,8 +290,7 @@ void do_fulltext_indexing(void) {
  // (This one does the "all of these words" search.)
  // Caller is responsible for freeing the message list.
  void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) {
-       int num_tokens = 0;
-       int *tokens = NULL;
+       Array *t = NULL;
         int i, j;
         struct cdbdata cdb_bucket;
         int num_all_msgs = 0;
@@ -302,12 +300,12 @@ void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) {
         long *ret_msgs = NULL;
         int tok;
  
-       wordbreaker(search_string, &num_tokens, &tokens);
-       if (num_tokens > 0) {
-               for (i=0; i<num_tokens; ++i) {
+       t = wordbreaker(search_string);
+       if (array_len(t) > 0) {
+               for (i=0; i<array_len(t); ++i) {
  
                         // search for tokens[i]
-                       tok = tokens[i];
+                       memcpy(&tok, array_get_element_at(t, i), sizeof(int));
  
                         // fetch the bucket, Liza
                         if (ftc_msgs[tok] == NULL) {
@@ -330,15 +328,15 @@ void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) {
                         }
  
                 }
-               free(tokens);
+               array_free(t);
                 if (all_msgs != NULL) {
                         qsort(all_msgs, num_all_msgs, sizeof(long), longcmp);
  
-                       // At this point, if a message appears num_tokens times in the
+                       // At this point, if a message appears array_len(t) times in the
                         // list, then it contains all of the search tokens.
-                       if (num_all_msgs >= num_tokens)
-                               for (j=0; j<(num_all_msgs-num_tokens+1); ++j) {
-                                       if (all_msgs[j] == all_msgs[j+num_tokens-1]) {
+                       if (num_all_msgs >= array_len(t))
+                               for (j=0; j<(num_all_msgs-array_len(t)+1); ++j) {
+                                       if (all_msgs[j] == all_msgs[j+array_len(t)-1]) {
                                                 ++num_ret_msgs;
                                                 if (num_ret_msgs > num_ret_alloc) {
                                                         num_ret_alloc += 64;
author	Art Cancro <ajc@citadel.org>
	Tue, 29 Aug 2023 21:12:04 +0000 (17:12 -0400)
committer	Art Cancro <ajc@citadel.org>
	Tue, 29 Aug 2023 21:12:04 +0000 (17:12 -0400)
citadel/server/modules/fulltext/ft_wordbreaker.c		patch | blob | history
citadel/server/modules/fulltext/ft_wordbreaker.h		patch | blob | history
citadel/server/modules/fulltext/serv_fulltext.c		patch | blob | history