/*
- * $Id$
- *
* Default wordbreaker module for full text indexing.
*
+ * Copyright (c) 2005-2017 by the citadel.org team
+ *
+ * This program is open source software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
*/
-
#include "sysdep.h"
#include <stdlib.h>
#include <unistd.h>
#include <pwd.h>
#include <errno.h>
#include <sys/types.h>
-
-#if TIME_WITH_SYS_TIME
-# include <sys/time.h>
-# include <time.h>
-#else
-# if HAVE_SYS_TIME_H
-# include <sys/time.h>
-# else
-# include <time.h>
-# endif
-#endif
-
+#include <time.h>
#include <sys/wait.h>
#include <ctype.h>
#include <string.h>
#include <limits.h>
+#include <libcitadel.h>
#include "citadel.h"
#include "server.h"
#include "sysdep_decls.h"
#include "database.h"
#include "msgbase.h"
#include "control.h"
-#include "tools.h"
#include "ft_wordbreaker.h"
#include "crc16.h"
+#include "ctdl_module.h"
/*
* Noise words are not included in search indices.
static char *noise_words[] = {	/* words wordbreaker() discards before computing a CRC token; compared with strcmp() after the candidate word is lowercased, so entries must stay lowercase */
"about",
"after",
- "all",
"also",
- "an",
- "and",
"another",
- "any",
- "are",
- "as",
- "at",
- "be",
"because",
"been",
"before",
"being",
"between",
"both",
- "but",
- "by",
"came",
- "can",
"come",
"could",
- "did",
- "do",
"each",
- "for",
"from",
- "get",
- "got",
- "had",
- "has",
"have",
- "he",
- "her",
"here",
- "him",
"himself",
- "his",
- "how",
- "if",
- "in",
"into",
- "is",
- "it",
"like",
"make",
"many",
- "me",
"might",
"more",
"most",
"much",
"must",
- "my",
"never",
- "now",
- "of",
- "on",
"only",
- "or",
"other",
- "our",
- "out",
"over",
"said",
"same",
- "see",
"should",
"since",
"some",
"take",
"than",
"that",
- "the",
"their",
"them",
"then",
"this",
"those",
"through",
- "to",
- "too",
"under",
- "up",
"very",
- "was",
- "way",
- "we",
"well",
"were",
"what",
"while",
"with",
"would",
- "you",
"your"
};
+#define NUM_NOISE (sizeof(noise_words) / sizeof(char *))	/* element count of noise_words[]; loop bound for the noise-word scan in wordbreaker() */
+
/*
* Compare function
}
-void wordbreaker(char *text, int *num_tokens, int **tokens) {
+void wordbreaker(const char *text, int *num_tokens, int **tokens) {
int wb_num_tokens = 0;
int wb_num_alloc = 0;
int *wb_tokens = NULL;
- char *ptr;
- char *word_start;
- char *word_end;
+ const char *ptr;
+ const char *word_start;
+ const char *word_end;
char ch;
int word_len;
char word[256];
int i;
int word_crc;
-
+
if (text == NULL) { /* no NULL text please */
*num_tokens = 0;
*tokens = NULL;
ch = *ptr;
if ( (!isalnum(ch)) && (word_start) ) {
word_end = ptr;
- --word_end;
/* extract the word */
- word_len = word_end - word_start + 1;
- safestrncpy(word, word_start, sizeof word);
+ word_len = word_end - word_start;
if (word_len >= sizeof word) {
- lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
- word[(sizeof word_len) - 1] = 0;
+ syslog(LOG_DEBUG, "wordbreaker: invalid word length: %d", word_len);
+ safestrncpy(word, word_start, sizeof word);
+ word[(sizeof word) - 1] = 0;
}
else {
+ safestrncpy(word, word_start, word_len+1);
word[word_len] = 0;
}
word_start = NULL;
- /* disqualify noise words */
- for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
- if (!strcasecmp(word, noise_words[i])) {
- word_len = 0;
- break;
- }
- }
-
/* are we ok with the length? */
- if ( (word_len >= WB_MIN)
- && (word_len <= WB_MAX) ) {
+ if ( (word_len >= WB_MIN) && (word_len <= WB_MAX) ) {
for (i=0; i<word_len; ++i) {
word[i] = tolower(word[i]);
}
- word_crc = (int)
- CalcCRC16Bytes(word_len, word);
+ /* disqualify noise words */
+ for (i=0; i<NUM_NOISE; ++i) {
+ if (!strcmp(word, noise_words[i])) {
+ word_len = 0;
+ break;
+ }
+ }
+
+ if (word_len == 0)
+ continue;
+
+ word_crc = (int) CalcCRC16Bytes(word_len, word);
++wb_num_tokens;
if (wb_num_tokens > wb_num_alloc) {