From: Art Cancro Date: Tue, 29 Aug 2023 21:12:04 +0000 (-0400) Subject: serv_fulltext: wordbreaker now returns a libcitadel Array X-Git-Tag: v989~15 X-Git-Url: https://code.citadel.org/?a=commitdiff_plain;h=300908107a924d03fe5f216976b561125ef7271c;p=citadel.git serv_fulltext: wordbreaker now returns a libcitadel Array --- diff --git a/citadel/server/modules/fulltext/ft_wordbreaker.c b/citadel/server/modules/fulltext/ft_wordbreaker.c index 81e605626..d8a11e55c 100644 --- a/citadel/server/modules/fulltext/ft_wordbreaker.c +++ b/citadel/server/modules/fulltext/ft_wordbreaker.c @@ -126,12 +126,7 @@ int intcmp(const void *rec1, const void *rec2) { } -void wordbreaker(const char *text, int *num_tokens, int **tokens) { - - int wb_num_tokens = 0; - int wb_num_alloc = 0; - int *wb_tokens = NULL; - +Array *wordbreaker(const char *text) { const char *ptr; const char *word_start; const char *word_end; @@ -140,17 +135,18 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) { char word[256]; int i; int word_crc; - + if (text == NULL) { /* no NULL text please */ - *num_tokens = 0; - *tokens = NULL; - return; + return(NULL); } if (text[0] == 0) { /* no empty text either */ - *num_tokens = 0; - *tokens = NULL; - return; + return(NULL); + } + + Array *found_tokens = array_new(sizeof(int)); + if (found_tokens == NULL) { + return(NULL); } ptr = text; @@ -187,41 +183,31 @@ void wordbreaker(const char *text, int *num_tokens, int **tokens) { } /* disqualify noise words */ for (i=0; i wb_num_alloc) { - wb_num_alloc += 512; - wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc)); + /* FIXME make this case insensitive */ + /* add it to the array */ + if (word_len > 0) { + word_crc = (int) CalcCRC16Bytes(word_len, word); + array_append(found_tokens, &word_crc); } - wb_tokens[wb_num_tokens - 1] = word_crc; } } } /* sort and purge dups */ - if (wb_num_tokens > 1) { - qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp); - for (i=0; i<(wb_num_tokens-1); ++i) { - if (wb_tokens[i] == wb_tokens[i+1]) { - memmove(&wb_tokens[i], &wb_tokens[i+1], - ((wb_num_tokens - i - 1)*sizeof(int))); - --wb_num_tokens; + if (array_len(found_tokens) > 1) { + array_sort(found_tokens, intcmp); + for (i=0; i<(array_len(found_tokens)); ++i) { + if (array_get_element_at(found_tokens, i) == array_get_element_at(found_tokens, i+1)) { + array_delete_element_at(found_tokens, i); --i; } } } - - *num_tokens = wb_num_tokens; - *tokens = wb_tokens; + return(found_tokens); } diff --git a/citadel/server/modules/fulltext/ft_wordbreaker.h b/citadel/server/modules/fulltext/ft_wordbreaker.h index e36d6951e..46767215a 100644 --- a/citadel/server/modules/fulltext/ft_wordbreaker.h +++ b/citadel/server/modules/fulltext/ft_wordbreaker.h @@ -23,4 +23,4 @@ #define WB_MIN 4 // nothing with 3 or less chars #define WB_MAX 40 -void wordbreaker(const char *text, int *num_tokens, int **tokens); +Array *wordbreaker(const char *text); diff --git a/citadel/server/modules/fulltext/serv_fulltext.c b/citadel/server/modules/fulltext/serv_fulltext.c index 4f04baad6..c71232e08 100644 --- a/citadel/server/modules/fulltext/serv_fulltext.c +++ b/citadel/server/modules/fulltext/serv_fulltext.c @@ -80,9 +80,8 @@ void ft_flush_cache(void) { // Index or de-index a message. (op == 1 to index, 0 to de-index) void ft_index_message(long msgnum, int op) { - int num_tokens = 0; - int *tokens = NULL; int i, j; + Array *t = NULL; struct cdbdata cdb_bucket; StrBuf *msgtext; char *txt; @@ -114,17 +113,17 @@ void ft_index_message(long msgnum, int op) { syslog(LOG_DEBUG, "fulltext: wordbreaking message %ld (%d bytes)", msgnum, StrLength(msgtext)); } txt = SmashStrBuf(&msgtext); - wordbreaker(txt, &num_tokens, &tokens); + t = wordbreaker(txt); free(txt); - syslog(LOG_DEBUG, "fulltext: %sindexing message %ld [%d tokens]", (op ? "" : "de"), msgnum, num_tokens); - if (num_tokens > 0) { - for (i=0; i 0) { + for (i=0; i= 0) && (tok <= 65535) ) { // fetch the bucket, Liza @@ -164,7 +163,7 @@ void ft_index_message(long msgnum, int op) { } } - free(tokens); + array_free(t); } } @@ -291,8 +290,7 @@ void do_fulltext_indexing(void) { // (This one does the "all of these words" search.) // Caller is responsible for freeing the message list. void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) { - int num_tokens = 0; - int *tokens = NULL; + Array *t = NULL; int i, j; struct cdbdata cdb_bucket; int num_all_msgs = 0; @@ -302,12 +300,12 @@ void ft_search(int *fts_num_msgs, long **fts_msgs, const char *search_string) { long *ret_msgs = NULL; int tok; - wordbreaker(search_string, &num_tokens, &tokens); - if (num_tokens > 0) { - for (i=0; i 0) { + for (i=0; i= num_tokens) - for (j=0; j<(num_all_msgs-num_tokens+1); ++j) { - if (all_msgs[j] == all_msgs[j+num_tokens-1]) { + if (num_all_msgs >= array_len(t)) + for (j=0; j<(num_all_msgs-array_len(t)+1); ++j) { + if (all_msgs[j] == all_msgs[j+array_len(t)-1]) { ++num_ret_msgs; if (num_ret_msgs > num_ret_alloc) { num_ret_alloc += 64;