Also fixed a small bug that caused some of our index tokens to be bad.
This may fix some of the text search problems some people are
seeing.
We also need to alter the indexer to index the message headers as well
as the body; at the moment we only index the body.
static char *noise_words[] = {
"about",
"after",
- "all",
"also",
- "an",
- "and",
"another",
- "any",
- "are",
- "as",
- "at",
- "be",
"because",
"been",
"before",
"being",
"between",
"both",
- "but",
- "by",
"came",
- "can",
"come",
"could",
- "did",
- "do",
"each",
- "for",
"from",
- "get",
- "got",
- "had",
- "has",
"have",
- "he",
- "her",
"here",
- "him",
"himself",
- "his",
- "how",
- "if",
- "in",
"into",
- "is",
- "it",
"like",
"make",
"many",
- "me",
"might",
"more",
"most",
"much",
"must",
- "my",
"never",
- "now",
- "of",
- "on",
"only",
- "or",
"other",
- "our",
- "out",
"over",
"said",
"same",
- "see",
"should",
"since",
"some",
"take",
"than",
"that",
- "the",
"their",
"them",
"then",
"this",
"those",
"through",
- "to",
- "too",
"under",
- "up",
"very",
- "was",
- "way",
- "we",
"well",
"were",
"what",
"while",
"with",
"would",
- "you",
"your"
};
ch = *ptr;
if ( (!isalnum(ch)) && (word_start) ) {
word_end = ptr;
- --word_end;
+// --word_end;
/* extract the word */
- word_len = word_end - word_start + 1;
- safestrncpy(word, word_start, sizeof word);
+ word_len = word_end - word_start;
if (word_len >= sizeof word) {
lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
- word[(sizeof word_len) - 1] = 0;
+ safestrncpy(word, word_start, sizeof word);
+ word[(sizeof word) - 1] = 0;
}
else {
+ safestrncpy(word, word_start, word_len+1);
word[word_len] = 0;
}
word_start = NULL;
- /* disqualify noise words */
- for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
- if (!strcasecmp(word, noise_words[i])) {
- word_len = 0;
- break;
- }
- }
-
/* are we ok with the length? */
if ( (word_len >= WB_MIN)
&& (word_len <= WB_MAX) ) {
for (i=0; i<word_len; ++i) {
word[i] = tolower(word[i]);
}
+ /* disqualify noise words */
+ for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+ if (!strcmp(word, noise_words[i])) {
+ word_len = 0;
+ break;
+ }
+ }
+ if (word_len == 0)
+ continue;
+
word_crc = (int)
CalcCRC16Bytes(word_len, word);
* later on, or even if we update this one, we can use a different ID so the
* system knows it needs to throw away the existing index and rebuild it.
*/
-#define FT_WORDBREAKER_ID 0x001f
+#define FT_WORDBREAKER_ID 0x0021
/*
* Minimum and maximum length of words to index
*/
-#define WB_MIN 3
+#define WB_MIN 4 // nothing with 3 or less chars
#define WB_MAX 40
void wordbreaker(char *text, int *num_tokens, int **tokens);
int i;
static time_t last_index = 0L;
static time_t last_progress = 0L;
-
+ time_t run_time = 0L;
+ time_t end_time = 0L;
+
/*
* Don't do this if the site doesn't have it enabled.
*/
if (CitControl.MMfulltext >= CitControl.MMhighest) {
return; /* nothing to do! */
}
-
- lprintf(CTDL_DEBUG, "do_fulltext_indexing() started\n");
+
+ run_time = time(NULL);
+ lprintf(CTDL_DEBUG, "do_fulltext_indexing() started (%ld)\n", run_time);
/*
* If we've switched wordbreaker modules, burn the index and start
* over.
*/
begin_critical_section(S_CONTROL);
- lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
- CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
if (CitControl.fulltext_wordbreaker != FT_WORDBREAKER_ID) {
+ lprintf(CTDL_DEBUG, "wb ver on disk = %d, code ver = %d\n",
+ CitControl.fulltext_wordbreaker, FT_WORDBREAKER_ID);
lprintf(CTDL_INFO, "(re)initializing full text index\n");
cdb_trunc(CDB_FULLTEXT);
CitControl.MMfulltext = 0L;
ft_num_alloc = 0;
ft_newmsgs = NULL;
}
+ end_time = time(NULL);
+ lprintf(CTDL_DEBUG, "do_fulltext_indexing() duration (%ld)\n", end_time - run_time);
/* Save our place so we don't have to do this again */
ft_flush_cache();