* Performance-optimized the full text indexer.

author Art Cancro <ajc@citadel.org>

Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)

committer Art Cancro <ajc@citadel.org>

Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)
author Art Cancro <ajc@citadel.org>
Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)
committer Art Cancro <ajc@citadel.org>
Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)
diff --git a/citadel/ChangeLog b/citadel/ChangeLog

index 5d5b1f0e7ecc08c7b728b53f0e4733e7e4fd5033..4d13a37e4de641e95e6e6b2f55a871d9b3629d06 100644 (file)
--- a/citadel/ChangeLog
+++ b/citadel/ChangeLog
@@ -1,4 +1,7 @@
   $Log$
+ Revision 647.14  2005/05/20 02:37:17  ajc
+ * Performance-optimized the full text indexer.
+
   Revision 647.13  2005/05/20 01:20:24  ajc
   * Cull logs immediately after a successful db checkpoint instead of only
     once every 24 hours.  During big db write operations (such as building
@@ -6736,4 +6739,3 @@ Sat Jul 11 00:20:48 EDT 1998 Nathan Bryant <bryant@cs.usm.maine.edu>
  
  Fri Jul 10 1998 Art Cancro <ajc@uncensored.citadel.org>
         * Initial CVS import
-
diff --git a/citadel/ft_wordbreaker.c b/citadel/ft_wordbreaker.c

index 0386eb6516eaa091079c5ed9c350bc330838ecb6..e45fd3641a640b2a04dc30d90cb887098efd0855 100644 (file)
--- a/citadel/ft_wordbreaker.c
+++ b/citadel/ft_wordbreaker.c
@@ -135,6 +135,7 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                                 memmove(&wb_tokens[i], &wb_tokens[i+1],
                                         ((wb_num_tokens - i - 1)*sizeof(int)));
                                 --wb_num_tokens;
+                               --i;
                         }
                 }
         }
diff --git a/citadel/ft_wordbreaker.h b/citadel/ft_wordbreaker.h

index caa5d56c0ac036f36948fa43b7113898e57389b8..3a14e86ea5620c0a3b56cb2ba894f08c133a376c 100644 (file)
--- a/citadel/ft_wordbreaker.h
+++ b/citadel/ft_wordbreaker.h
@@ -9,7 +9,7 @@
   * later on, or even if we update this one, we can use a different ID so the
   * system knows it needs to throw away the existing index and rebuild it.
   */
-#define        FT_WORDBREAKER_ID       0x0011
+#define        FT_WORDBREAKER_ID       0x0019
  
  /*
   * Minimum and maximum length of words to index
diff --git a/citadel/ft_wordbreaker.o b/citadel/ft_wordbreaker.o

index 2eb81eaf1a22e465f415ff134abf0b625efff22e..7bc264ccc2132d725395c270a87152d0ca5bf90f 100644 (file)

Binary files a/citadel/ft_wordbreaker.o and b/citadel/ft_wordbreaker.o differ
diff --git a/citadel/serv_fulltext.c b/citadel/serv_fulltext.c

index cd8551b0789351b65340b2aa5bd5a867c45aeb90..d88f50f5e6ac28941af600b6555555176825a956 100644 (file)
--- a/citadel/serv_fulltext.c
+++ b/citadel/serv_fulltext.c
@@ -88,7 +88,6 @@ void ft_index_message(long msgnum, int op) {
         /* Output the message as text before indexing it, so we don't end up
          * indexing a bunch of encoded base64, etc.
          */
-       lprintf(CTDL_DEBUG, "Fetching...\n");
         CC->redirect_buffer = malloc(SIZ);
         CC->redirect_len = 0;
         CC->redirect_alloc = SIZ;
@@ -97,11 +96,11 @@ void ft_index_message(long msgnum, int op) {
         CC->redirect_buffer = NULL;
         CC->redirect_len = 0;
         CC->redirect_alloc = 0;
-       lprintf(CTDL_DEBUG, "Wordbreaking...\n");
+       lprintf(CTDL_DEBUG, "Wordbreaking message %ld...\n", msgnum);
         wordbreaker(msgtext, &num_tokens, &tokens);
         free(msgtext);
  
-       lprintf(CTDL_DEBUG, "Indexing...\n");
+       lprintf(CTDL_DEBUG, "Indexing message %ld...\n", msgnum);
         if (num_tokens > 0) {
                 for (i=0; i<num_tokens; ++i) {
  
@@ -133,12 +132,20 @@ void ft_index_message(long msgnum, int op) {
                                                 if (msgs[j] == msgnum) {
                                                         memmove(&msgs[j], &msgs[j+1], ((num_msgs - j - 1)*sizeof(long)));
                                                         --num_msgs;
+                                                       --j;
                                                 }
                                         }
                                 }
                         }
  
-                       /* sort and purge dups */
+                       /* sort and purge dups 
+                        *
+                        * This whole section is commented out because it's
+                        * no longer needed -- since the tokenizer already
+                        * does a merge/purge on the tokens it returns, and
+                        * we're guaranteed to always be indexing a message
+                        * with a number higher than any already in the index.
+                        * 
                         if ( (op == 1) && (num_msgs > 1) ) {
                                 msgs = (long *) cdb_bucket->ptr;
                                 qsort(msgs, num_msgs, sizeof(long), longcmp);
@@ -146,9 +153,11 @@ void ft_index_message(long msgnum, int op) {
                                         if (msgs[j] == msgs[j+1]) {
                                                 memmove(&msgs[j], &msgs[j+1], ((num_msgs - j - 1)*sizeof(long)));
                                                 --num_msgs;
+                                               --j;
                                         }
                                 }
                         }
+                       */
  
                         cdb_store(CDB_FULLTEXT, &tokens[i], sizeof(int),
                                 msgs, (num_msgs*sizeof(long)) );
@@ -241,6 +250,7 @@ void do_fulltext_indexing(void) {
                                 memmove(&ft_newmsgs[i], &ft_newmsgs[i+1],
                                         ((ft_num_msgs - i - 1)*sizeof(long)));
                                 --ft_num_msgs;
+                               --i;
                         }
                 }
author	Art Cancro <ajc@citadel.org>
	Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)
committer	Art Cancro <ajc@citadel.org>
	Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)
citadel/ChangeLog		patch | blob | history
citadel/ft_wordbreaker.c		patch | blob | history
citadel/ft_wordbreaker.h		patch | blob | history
citadel/ft_wordbreaker.o		patch | blob | history
citadel/serv_fulltext.c		patch | blob | history