code.citadel.org Git - citadel.git/commitdiff
* Performance-optimized the full text indexer.
author Art Cancro <ajc@citadel.org>
Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)
committer Art Cancro <ajc@citadel.org>
Fri, 20 May 2005 02:37:17 +0000 (02:37 +0000)
citadel/ChangeLog
citadel/ft_wordbreaker.c
citadel/ft_wordbreaker.h
citadel/ft_wordbreaker.o
citadel/serv_fulltext.c

index 5d5b1f0e7ecc08c7b728b53f0e4733e7e4fd5033..4d13a37e4de641e95e6e6b2f55a871d9b3629d06 100644 (file)
@@ -1,4 +1,7 @@
  $Log$
+ Revision 647.14  2005/05/20 02:37:17  ajc
+ * Performance-optimized the full text indexer.
+
  Revision 647.13  2005/05/20 01:20:24  ajc
  * Cull logs immediately after a successful db checkpoint instead of only
    once every 24 hours.  During big db write operations (such as building
@@ -6736,4 +6739,3 @@ Sat Jul 11 00:20:48 EDT 1998 Nathan Bryant <bryant@cs.usm.maine.edu>
 
 Fri Jul 10 1998 Art Cancro <ajc@uncensored.citadel.org>
        * Initial CVS import
-
index 0386eb6516eaa091079c5ed9c350bc330838ecb6..e45fd3641a640b2a04dc30d90cb887098efd0855 100644 (file)
@@ -135,6 +135,7 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
                                memmove(&wb_tokens[i], &wb_tokens[i+1],
                                        ((wb_num_tokens - i - 1)*sizeof(int)));
                                --wb_num_tokens;
+                               --i;
                        }
                }
        }
index caa5d56c0ac036f36948fa43b7113898e57389b8..3a14e86ea5620c0a3b56cb2ba894f08c133a376c 100644 (file)
@@ -9,7 +9,7 @@
  * later on, or even if we update this one, we can use a different ID so the
  * system knows it needs to throw away the existing index and rebuild it.
  */
-#define        FT_WORDBREAKER_ID       0x0011
+#define        FT_WORDBREAKER_ID       0x0019
 
 /*
  * Minimum and maximum length of words to index
index 2eb81eaf1a22e465f415ff134abf0b625efff22e..7bc264ccc2132d725395c270a87152d0ca5bf90f 100644 (file)
Binary files a/citadel/ft_wordbreaker.o and b/citadel/ft_wordbreaker.o differ
index cd8551b0789351b65340b2aa5bd5a867c45aeb90..d88f50f5e6ac28941af600b6555555176825a956 100644 (file)
@@ -88,7 +88,6 @@ void ft_index_message(long msgnum, int op) {
        /* Output the message as text before indexing it, so we don't end up
         * indexing a bunch of encoded base64, etc.
         */
-       lprintf(CTDL_DEBUG, "Fetching...\n");
        CC->redirect_buffer = malloc(SIZ);
        CC->redirect_len = 0;
        CC->redirect_alloc = SIZ;
@@ -97,11 +96,11 @@ void ft_index_message(long msgnum, int op) {
        CC->redirect_buffer = NULL;
        CC->redirect_len = 0;
        CC->redirect_alloc = 0;
-       lprintf(CTDL_DEBUG, "Wordbreaking...\n");
+       lprintf(CTDL_DEBUG, "Wordbreaking message %ld...\n", msgnum);
        wordbreaker(msgtext, &num_tokens, &tokens);
        free(msgtext);
 
-       lprintf(CTDL_DEBUG, "Indexing...\n");
+       lprintf(CTDL_DEBUG, "Indexing message %ld...\n", msgnum);
        if (num_tokens > 0) {
                for (i=0; i<num_tokens; ++i) {
 
@@ -133,12 +132,20 @@ void ft_index_message(long msgnum, int op) {
                                                if (msgs[j] == msgnum) {
                                                        memmove(&msgs[j], &msgs[j+1], ((num_msgs - j - 1)*sizeof(long)));
                                                        --num_msgs;
+                                                       --j;
                                                }
                                        }
                                }
                        }
 
-                       /* sort and purge dups */
+                       /* sort and purge dups 
+                        *
+                        * This whole section is commented out because it's
+                        * no longer needed -- since the tokenizer already
+                        * does a merge/purge on the tokens it returns, and
+                        * we're guaranteed to always be indexing a message
+                        * with a number higher than any already in the index.
+                        * 
                        if ( (op == 1) && (num_msgs > 1) ) {
                                msgs = (long *) cdb_bucket->ptr;
                                qsort(msgs, num_msgs, sizeof(long), longcmp);
@@ -146,9 +153,11 @@ void ft_index_message(long msgnum, int op) {
                                        if (msgs[j] == msgs[j+1]) {
                                                memmove(&msgs[j], &msgs[j+1], ((num_msgs - j - 1)*sizeof(long)));
                                                --num_msgs;
+                                               --j;
                                        }
                                }
                        }
+                       */
 
                        cdb_store(CDB_FULLTEXT, &tokens[i], sizeof(int),
                                msgs, (num_msgs*sizeof(long)) );
@@ -241,6 +250,7 @@ void do_fulltext_indexing(void) {
                                memmove(&ft_newmsgs[i], &ft_newmsgs[i+1],
                                        ((ft_num_msgs - i - 1)*sizeof(long)));
                                --ft_num_msgs;
+                               --i;
                        }
                }