From 2e76d1431d857054387a4f22b74a96c86e80f958 Mon Sep 17 00:00:00 2001
From: Dave West <davew@uncensored.citadel.org>
Date: Tue, 13 Nov 2007 02:37:45 +0000
Subject: [PATCH] A little bit of a speedup in the wordbreaker for full text
 indexing. Also fixed a small bug that made some of our index tokens bad.
 Maybe this will fix some of the text search problems some people are seeing.
 We also need to alter the indexer to index the message headers as well as the
 body; at the moment we only index the body.

---
 citadel/modules/fulltext/ft_wordbreaker.c | 71 +++++------------------
 citadel/modules/fulltext/ft_wordbreaker.h |  4 +-
 citadel/modules/fulltext/serv_fulltext.c  | 15 +++--
 3 files changed, 27 insertions(+), 63 deletions(-)

diff --git a/citadel/modules/fulltext/ft_wordbreaker.c b/citadel/modules/fulltext/ft_wordbreaker.c
index 6b9fb2d24..12b0f667c 100644
--- a/citadel/modules/fulltext/ft_wordbreaker.c
+++ b/citadel/modules/fulltext/ft_wordbreaker.c
@@ -52,73 +52,37 @@
 static char *noise_words[] = {
 	"about",
 	"after",
-	"all",
 	"also",
-	"an",
-	"and",
 	"another",
-	"any",
-	"are",
-	"as",
-	"at",
-	"be",
 	"because",
 	"been",
 	"before",
 	"being",
 	"between",
 	"both",
-	"but",
-	"by",
 	"came",
-	"can",
 	"come",
 	"could",
-	"did",
-	"do",
 	"each",
-	"for",
 	"from",
-	"get",
-	"got",
-	"had",
-	"has",
 	"have",
-	"he",
-	"her",
 	"here",
-	"him",
 	"himself",
-	"his",
-	"how",
-	"if",
-	"in",
 	"into",
-	"is",
-	"it",
 	"like",
 	"make",
 	"many",
-	"me",
 	"might",
 	"more",
 	"most",
 	"much",
 	"must",
-	"my",
 	"never",
-	"now",
-	"of",
-	"on",
 	"only",
-	"or",
 	"other",
-	"our",
-	"out",
 	"over",
 	"said",
 	"same",
-	"see",
 	"should",
 	"since",
 	"some",
@@ -127,7 +91,6 @@ static char *noise_words[] = {
 	"take",
 	"than",
 	"that",
-	"the",
 	"their",
 	"them",
 	"then",
@@ -137,14 +100,8 @@ static char *noise_words[] = {
 	"this",
 	"those",
 	"through",
-	"to",
-	"too",
 	"under",
-	"up",
 	"very",
-	"was",
-	"way",
-	"we",
 	"well",
 	"were",
 	"what",
@@ -153,7 +110,6 @@ static char *noise_words[] = {
 	"while",
 	"with",
 	"would",
-	"you",
 	"your"
 };
 
@@ -212,34 +168,37 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
 		ch = *ptr;
 		if ( (!isalnum(ch)) && (word_start) ) {
 			word_end = ptr;
-			--word_end;
+//			--word_end;
 
 			/* extract the word */
-			word_len = word_end - word_start + 1;
-			safestrncpy(word, word_start, sizeof word);
+			word_len = word_end - word_start;
 			if (word_len >= sizeof word) {
 				lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
-				word[(sizeof word_len) - 1] = 0;
+				safestrncpy(word, word_start, sizeof word);
+				word[(sizeof word) - 1] = 0;
 			}
 			else {
+				safestrncpy(word, word_start, word_len+1);
 				word[word_len] = 0;
 			}
 			word_start = NULL;
 
-			/* disqualify noise words */
-			for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
-				if (!strcasecmp(word, noise_words[i])) {
-					word_len = 0;
-					break;
-				}
-			}
-
 			/* are we ok with the length? */
 			if ( (word_len >= WB_MIN)
 			   && (word_len <= WB_MAX) ) {
 				for (i=0; i<word_len; ++i) {
 					word[i] = tolower(word[i]);
 				}
+				/* disqualify noise words */
+				for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+					if (!strcmp(word, noise_words[i])) {
+						word_len = 0;
+						break;
+					}
+				}
+				if (word_len == 0)
+					continue;
+
 				word_crc = (int)
 					CalcCRC16Bytes(word_len, word);
 
diff --git a/citadel/modules/fulltext/ft_wordbreaker.h b/citadel/modules/fulltext/ft_wordbreaker.h
index 5f1fb99fe..16f714560 100644
--- a/citadel/modules/fulltext/ft_wordbreaker.h
+++ b/citadel/modules/fulltext/ft_wordbreaker.h
@@ -9,12 +9,12 @@
  * later on, or even if we update this one, we can use a different ID so the
  * system knows it needs to throw away the existing index and rebuild it.
  */
-#define	FT_WORDBREAKER_ID	0x001f
+#define	FT_WORDBREAKER_ID	0x0021
 
 /*
  * Minimum and maximum length of words to index
  */
-#define WB_MIN			3
+#define WB_MIN			4	// nothing with 3 or less chars
 #define WB_MAX			40
 
 void wordbreaker(char *text, int *num_tokens, int **tokens);
diff --git a/citadel/modules/fulltext/serv_fulltext.c b/citadel/modules/fulltext/serv_fulltext.c
index ba3f948a0..4e12ead4a 100644
--- a/citadel/modules/fulltext/serv_fulltext.c
+++ b/citadel/modules/fulltext/serv_fulltext.c
@@ -219,7 +219,9 @@ void do_fulltext_indexing(void) {
 	int i;
 	static time_t last_index = 0L;
 	static time_t last_progress = 0L;
-
+	time_t run_time = 0L;
+	time_t end_time = 0L;
+	
 	/*
 	 * Don't do this if the site doesn't have it enabled.
 	 */
@@ -242,17 +244,18 @@ void do_fulltext_indexing(void) {
 	if (CitControl.MMfulltext >= CitControl.MMhighest) {
 		return;		/* nothing to do! */
 	}
-
-	lprintf(CTDL_DEBUG, "do_fulltext_indexing() started\n");
+	
+	run_time = time(NULL);
+	lprintf(CTDL_DEBUG, "do_fulltext_indexing() started (%ld)\n", run_time);
 	
 	/*
 	 * If we've switched wordbreaker modules, burn the index and start
 	 * over.
 	 */
 	begin_critical_section(S_CONTROL);
-	lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
-			CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
 	if (CitControl.fulltext_wordbreaker != FT_WORDBREAKER_ID) {
+		lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
+			CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
 		lprintf(CTDL_INFO, "(re)initializing full text index\n");
 		cdb_trunc(CDB_FULLTEXT);
 		CitControl.MMfulltext = 0L;
@@ -310,6 +313,8 @@ void do_fulltext_indexing(void) {
 		ft_num_alloc = 0;
 		ft_newmsgs = NULL;
 	}
+	end_time = time(NULL);
+	lprintf(CTDL_DEBUG, "do_fulltext_indexing() duration (%ld)\n", end_time - run_time);
 
 	/* Save our place so we don't have to do this again */
 	ft_flush_cache();
-- 
2.30.2