4 * Bring external RSS feeds into rooms.
6 * Copyright (c) 2007-2009 by the citadel.org team
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 3 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 #if TIME_WITH_SYS_TIME
28 # include <sys/time.h>
32 # include <sys/time.h>
41 #include <sys/types.h>
44 #include <curl/curl.h>
45 #include <libcitadel.h>
48 #include "citserver.h"
53 #include "ctdl_module.h"
54 #include "clientsocket.h"
56 #include "parsedate.h"
58 #include "citadel_dirs.h"
63 struct rssnetcfg *next;
78 char channel_title[256];
80 char *author_or_creator;
/*
 * Head of the list of pending feed requests.  Entries are chained through
 * their `next` member: rssclient_scan_room() searches the list by URL and
 * links new entries in front of the current head, and rssclient_scan() walks
 * the list to fetch each feed.
 */
83 struct rssnetcfg *rnclist = NULL;
/*
 * rss_save_item()
 *
 * Post one parsed feed item as a message to the rooms named in ri->roomlist,
 * using CDB_USETABLE to recognise items that were stored before.
 *
 * ri  Item to store.  ri->roomlist must be non-NULL (it is handed to strdup()
 *     and num_tokens()).  ri->description and ri->link are replaced by empty
 *     heap strings when they are NULL, so the caller's struct is modified.
 *
 * The use-table key is "rss/<guid>" when the item has a guid.  The remaining
 * key code hashes title and link with MD5, writes the digest as lower-case
 * hex and appends "_rss2ctdl".  The use-table record is rewritten with the
 * current time both where the item was already seen and after a new item has
 * been submitted.
 *
 * Nothing is returned.  If the recipient structure cannot be allocated the
 * function returns before doing any work.
 *
 * NOTE(review): the embedded line numbers skip, so some lines of this
 * function (local declarations, else branches, closing braces) are not shown
 * here.  Which branch a statement belongs to is taken from the surviving
 * comments and should be confirmed against the complete file.
 */
87 * Commit a fetched and parsed RSS item to disk
89 void rss_save_item(struct rss_item *ri) {
91 struct MD5Context md5context;
92 u_char rawdigest[MD5_DIGEST_LEN];
95 struct cdbdata *cdbut;
97 struct CtdlMessage *msg;
98 struct recptypes *recp = NULL;
/* Build the recipient list: one room per '|'-separated token of roomlist. */
101 recp = (struct recptypes *) malloc(sizeof(struct recptypes));
102 if (recp == NULL) return;
103 memset(recp, 0, sizeof(struct recptypes));
104 recp->recp_room = strdup(ri->roomlist);
105 recp->num_room = num_tokens(ri->roomlist, '|');
106 recp->recptypes_magic = RECPTYPES_MAGIC;
108 /* Construct a GUID to use in the S_USETABLE table.
109 * If one is not present in the item itself, make one up.
111 if (ri->guid != NULL) {
112 snprintf(utmsgid, sizeof utmsgid, "rss/%s", ri->guid);
115 MD5Init(&md5context);
116 if (ri->title != NULL) {
117 MD5Update(&md5context, (unsigned char*)ri->title, strlen(ri->title));
119 if (ri->link != NULL) {
120 MD5Update(&md5context, (unsigned char*)ri->link, strlen(ri->link));
122 MD5Final(rawdigest, &md5context);
123 for (i=0; i<MD5_DIGEST_LEN; i++) {
124 sprintf(&utmsgid[i*2], "%02X", (unsigned char) (rawdigest[i] & 0xff));
125 utmsgid[i*2] = tolower(utmsgid[i*2]);
126 utmsgid[(i*2)+1] = tolower(utmsgid[(i*2)+1]);
128 strcat(utmsgid, "_rss2ctdl");
131 /* Find out if we've already seen this item */
132 cdbut = cdb_fetch(CDB_USETABLE, utmsgid, strlen(utmsgid));
134 /* Item has already been seen */
135 CtdlLogPrintf(CTDL_DEBUG, "%s has already been seen\n", utmsgid);
138 /* rewrite the record anyway, to update the timestamp */
/* NOTE(review): strcpy() is unbounded; the size of ut.ut_msgid is not visible here -- confirm it can hold utmsgid. */
139 strcpy(ut.ut_msgid, utmsgid);
140 ut.ut_timestamp = time(NULL);
141 cdb_store(CDB_USETABLE, utmsgid, strlen(utmsgid), &ut, sizeof(struct UseTable) );
144 /* Item has not been seen, so save it. */
146 if (ri->description == NULL) ri->description = strdup("");
/* Walk the description from its terminating NUL back to index 0 and turn every whitespace character into a plain space. */
/* NOTE(review): isspace() is given a plain char; casting to unsigned char would avoid undefined behaviour for negative values. */
147 for (i=strlen(ri->description); i>=0; --i) {
148 if (isspace(ri->description[i])) {
149 ri->description[i] = ' ';
/* NOTE(review): the malloc() result is passed to memset() on the next line without a NULL check. */
153 msg = malloc(sizeof(struct CtdlMessage));
154 memset(msg, 0, sizeof(struct CtdlMessage));
155 msg->cm_magic = CTDLMESSAGE_MAGIC;
156 msg->cm_anon_type = MES_NORMAL;
157 msg->cm_format_type = FMT_RFC822;
/* 'A' field: the item's author or creator converted from HTML, with the literal "rss" set further down (presumably as the fallback). */
159 if (ri->author_or_creator != NULL) {
160 msg->cm_fields['A'] = html_to_ascii(ri->author_or_creator,
161 strlen(ri->author_or_creator), 512, 0);
162 striplt(msg->cm_fields['A']);
165 msg->cm_fields['A'] = strdup("rss");
168 msg->cm_fields['N'] = strdup(NODENAME);
/* 'U' field: the item title converted from HTML and trimmed. */
169 if (ri->title != NULL) {
170 msg->cm_fields['U'] = html_to_ascii(ri->title, strlen(ri->title), 512, 0);
171 striplt(msg->cm_fields['U']);
/* 'T' field: the publication time printed as a decimal number. */
173 msg->cm_fields['T'] = malloc(64);
174 snprintf(msg->cm_fields['T'], 64, "%ld", ri->pubdate);
/* NOTE(review): channel_title is used with sizeof as a copy destination elsewhere in this file, so it looks like an array member and this NULL test would always pass -- confirm. */
175 if (ri->channel_title != NULL) {
176 if (!IsEmptyStr(ri->channel_title)) {
177 msg->cm_fields['O'] = strdup(ri->channel_title);
180 if (ri->link == NULL)
181 ri->link = strdup("");
/* Body buffer: 1024 bytes of headroom plus the lengths of link and description; the snprintf() below is bounded by the same msglen. */
182 msglen += 1024 + strlen(ri->link) + strlen(ri->description) ;
183 msg->cm_fields['M'] = malloc(msglen);
184 snprintf(msg->cm_fields['M'], msglen,
185 "Content-type: text/html; charset=\"UTF-8\"\r\n\r\n"
188 "<a href=\"%s\">%s</a>\n"
/* Submit the message for the recipient rooms, then release the local copy. */
195 CtdlSubmitMsg(msg, recp, NULL, 0);
196 CtdlFreeMessage(msg);
198 /* write the uidl to the use table so we don't store this item again */
199 strcpy(ut.ut_msgid, utmsgid);
200 ut.ut_timestamp = time(NULL);
201 cdb_store(CDB_USETABLE, utmsgid, strlen(utmsgid), &ut, sizeof(struct UseTable) );
203 free_recipients(recp);
/*
 * rdf_parsedate()
 *
 * Turn the text of a feed date element into a time_t.
 *
 * p  NUL-terminated date string; must not be NULL because strlen() is called
 *    on it first.
 *
 * Strings shorter than 10 characters yield 0.  When p[4] and p[7] are both
 * '-', year, month and day are read with atoi() from offsets 0, 5 and 8
 * (stored as year-1900 and month-1).  Hour and minute are read from offsets
 * 11 and 14 only when p[10] is 'T' and p[13] is ':'; the && short-circuit
 * keeps p[13] from being read when the string ends at offset 10.
 *
 * The surviving comments describe two further steps: an attempt to parse the
 * string as an RFC822 date, whose result is returned when positive, and a
 * final fallback to the current time.
 *
 * NOTE(review): the lines that convert `tm` to a time_t, call the RFC822
 * parser and return the fallback value are not shown in this listing.
 */
209 * Convert an RDF/RSS datestamp into a time_t
211 time_t rdf_parsedate(char *p)
217 if (strlen(p) < 10) return 0L;
219 memset(&tm, 0, sizeof tm);
221 /* YYYY-MM-DDTHH:MM format...
223 if ( (p[4] == '-') && (p[7] == '-') ) {
224 tm.tm_year = atoi(&p[0]) - 1900;
225 tm.tm_mon = atoi(&p[5]) - 1;
226 tm.tm_mday = atoi(&p[8]);
227 if ( (p[10] == 'T') && (p[13] == ':') ) {
228 tm.tm_hour = atoi(&p[11]);
229 tm.tm_min = atoi(&p[14]);
234 /* hmm... try RFC822 date stamp format */
237 if (t > 0) return(t);
239 /* yeesh. ok, just return the current date and time. */
/*
 * rss_xml_start()
 *
 * Start-of-element handler; rss_do_fetching() registers it with
 * XML_SetElementHandler().
 *
 * data         The struct rss_item installed as expat user data.
 * supplied_el  Element name as reported by the parser.
 * attr         Element attributes; not referenced on any line shown here.
 *
 * The name is copied into a local buffer and its namespace part is dropped
 * (the loop body is not shown in this listing).  For an "item" element,
 * compared case-insensitively, the nesting counter is incremented and the
 * per-item strings are freed.  Accumulated character data is then discarded
 * by resetting chardata_len to 0.
 *
 * NOTE(review): closing braces are missing from this listing, so whether the
 * field reset sits inside the "item" test should be confirmed.
 */
245 void rss_xml_start(void *data, const char *supplied_el, const char **attr) {
246 struct rss_item *ri = (struct rss_item *) data;
250 /* Axe the namespace, we don't care about it */
251 safestrncpy(el, supplied_el, sizeof el);
252 while (sep = strchr(el, ':'), sep) {
256 if (!strcasecmp(el, "item")) {
257 ++ri->item_tag_nesting;
259 /* Initialize the feed item data structure */
260 if (ri->guid != NULL) free(ri->guid);
262 if (ri->title != NULL) free(ri->title);
264 if (ri->link != NULL) free(ri->link);
266 if (ri->author_or_creator != NULL) free(ri->author_or_creator);
267 ri->author_or_creator = NULL;
268 if (ri->description != NULL) free(ri->description);
269 ri->description = NULL;
271 /* Throw away any existing character data */
272 if (ri->chardata_len > 0) {
275 ri->chardata_len = 0;
/*
 * rss_xml_end()
 *
 * End-of-element handler; rss_do_fetching() registers it with
 * XML_SetElementHandler().
 *
 * data         The struct rss_item installed as expat user data.
 * supplied_el  Element name as reported by the parser.
 *
 * The namespace part of the name is dropped, then the character data
 * collected for the element is filed according to the element name.  All
 * name comparisons are case-insensitive, and every field below is only
 * touched when chardata is non-NULL:
 *
 *   title (outside any item)  copied into channel_title and trimmed
 *   guid, title, link         trimmed, duplicated, previous value freed
 *   description               duplicated as-is, previous value freed
 *   pubdate / date            trimmed and parsed with rdf_parsedate()
 *   author / creator          trimmed, duplicated, previous value freed
 *
 * An "item" end tag decrements the nesting counter, and an "rss" or "rdf"
 * end tag sets done_parsing.  Finally chardata_len is reset to 0.
 *
 * NOTE(review): lines are missing from this listing after the nesting counter
 * is decremented; saving the completed item presumably happens there.
 */
283 void rss_xml_end(void *data, const char *supplied_el) {
284 struct rss_item *ri = (struct rss_item *) data;
288 /* Axe the namespace, we don't care about it */
289 safestrncpy(el, supplied_el, sizeof el);
290 while (sep = strchr(el, ':'), sep) {
294 if ( (!strcasecmp(el, "title")) && (ri->item_tag_nesting == 0) && (ri->chardata != NULL) ) {
295 safestrncpy(ri->channel_title, ri->chardata, sizeof ri->channel_title);
296 striplt(ri->channel_title);
299 if ( (!strcasecmp(el, "guid")) && (ri->chardata != NULL) ) {
300 if (ri->guid != NULL) free(ri->guid);
301 striplt(ri->chardata);
302 ri->guid = strdup(ri->chardata);
/* NOTE(review): this test has no nesting check, so a channel-level title that matched the first test above is stored in ri->title as well. */
305 if ( (!strcasecmp(el, "title")) && (ri->chardata != NULL) ) {
306 if (ri->title != NULL) free(ri->title);
307 striplt(ri->chardata);
308 ri->title = strdup(ri->chardata);
311 if ( (!strcasecmp(el, "link")) && (ri->chardata != NULL) ) {
312 if (ri->link != NULL) free(ri->link);
313 striplt(ri->chardata);
314 ri->link = strdup(ri->chardata);
/* Unlike the other text fields, the description is not trimmed before it is copied. */
317 if ( (!strcasecmp(el, "description")) && (ri->chardata != NULL) ) {
318 if (ri->description != NULL) free(ri->description);
319 ri->description = strdup(ri->chardata);
322 if ( ((!strcasecmp(el, "pubdate")) || (!strcasecmp(el, "date"))) && (ri->chardata != NULL) ) {
323 striplt(ri->chardata);
324 ri->pubdate = rdf_parsedate(ri->chardata);
327 if ( ((!strcasecmp(el, "author")) || (!strcasecmp(el, "creator"))) && (ri->chardata != NULL) ) {
328 if (ri->author_or_creator != NULL) free(ri->author_or_creator);
329 striplt(ri->chardata);
330 ri->author_or_creator = strdup(ri->chardata);
333 if (!strcasecmp(el, "item")) {
334 --ri->item_tag_nesting;
/* The closing root element marks the end of the feed; rss_do_fetching() reads done_parsing after the transfer. */
338 if ( (!strcasecmp(el, "rss")) || (!strcasecmp(el, "rdf")) ) {
339 CtdlLogPrintf(CTDL_DEBUG, "End of feed detected. Closing parser.\n");
340 ri->done_parsing = 1;
343 if (ri->chardata_len > 0) {
346 ri->chardata_len = 0;
/*
 * rss_xml_chardata()
 *
 * Character-data handler; rss_do_fetching() registers it with
 * XML_SetCharacterDataHandler().
 *
 * data  The struct rss_item installed as expat user data.
 * s     Text fragment from the parser; `len` bytes are copied from it.
 * len   Number of bytes in s.
 *
 * The fragment is appended to ri->chardata: the buffer is grown with
 * realloc() to hold the old contents, the new bytes and a terminating NUL,
 * and chardata_len is advanced.  When realloc() fails the fragment is dropped
 * and the existing buffer and length are left unchanged.
 */
353 * This callback stores up the data which appears in between tags.
355 void rss_xml_chardata(void *data, const XML_Char *s, int len) {
356 struct rss_item *ri = (struct rss_item *) data;
361 old_len = ri->chardata_len;
362 new_len = old_len + len;
363 new_buffer = realloc(ri->chardata, new_len + 1);
364 if (new_buffer != NULL) {
365 memcpy(&new_buffer[old_len], s, len);
366 new_buffer[new_len] = 0;
367 ri->chardata = new_buffer;
368 ri->chardata_len = new_len;
/*
 * rss_libcurl_callback()
 *
 * Write callback installed through CURLOPT_WRITEFUNCTION in rss_do_fetching().
 *
 * ptr     Data received by libcurl.
 * size    Size of one data element.
 * nmemb   Number of elements; size * nmemb bytes are passed to the parser.
 * stream  The XML_Parser that rss_do_fetching() set as CURLOPT_WRITEDATA.
 *
 * Each chunk is fed to expat with the "final" flag set to 0.
 *
 * NOTE(review): the XML_Parse() result is ignored, and the return statement
 * of this function is not shown in this listing.
 */
375 * Callback function for passing libcurl's output to expat for parsing
377 size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
379 XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
/*
 * rss_do_fetching()
 *
 * Download one feed with libcurl and run it through an expat parser whose
 * handlers are rss_xml_start(), rss_xml_end() and rss_xml_chardata().
 *
 * url    Address of the feed; logged and passed to libcurl as CURLOPT_URL.
 * rooms  Target rooms for the feed.  No line shown here references it;
 *        presumably it is stored in the item structure -- confirm.
 *
 * Failure to create the curl handle or the parser is logged at CTDL_ALERT.
 * A libcurl error is logged with the text libcurl placed in errmsg.
 * CtdlThreadCheckStop() is consulted before and after the transfer.  On the
 * way out the curl handle is cleaned up and the strings held in the item
 * structure are freed.
 *
 * NOTE(review): return statements, the parser release and several braces are
 * not shown in this listing.
 */
388 void rss_do_fetching(char *url, char *rooms) {
394 char errmsg[1024] = "";
396 CtdlLogPrintf(CTDL_DEBUG, "Fetching RSS feed <%s>\n", url);
398 curl = curl_easy_init();
400 CtdlLogPrintf(CTDL_ALERT, "Unable to initialize libcurl.\n");
404 xp = XML_ParserCreateNS("UTF-8", ':');
406 CtdlLogPrintf(CTDL_ALERT, "Cannot create XML parser!\n");
407 curl_easy_cleanup(curl);
411 curl_easy_setopt(curl, CURLOPT_URL, url);
/* NOTE(review): certificate and host name verification are both switched off, so HTTPS feeds are fetched without authenticating the server. */
412 curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0);
413 curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0);
/* Received data goes to rss_libcurl_callback(), which gets the parser as its stream argument. */
414 curl_easy_setopt(curl, CURLOPT_WRITEDATA, xp);
415 curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, rss_libcurl_callback);
416 curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errmsg);
417 curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1);
418 #ifdef CURLOPT_HTTP_CONTENT_DECODING
419 curl_easy_setopt(curl, CURLOPT_HTTP_CONTENT_DECODING, 1);
420 curl_easy_setopt(curl, CURLOPT_ENCODING, "");
422 curl_easy_setopt(curl, CURLOPT_USERAGENT, CITADEL);
423 curl_easy_setopt(curl, CURLOPT_TIMEOUT, 180); /* die after 180 seconds */
/* Use the configured address as the outgoing interface when one is set. */
424 if (!IsEmptyStr(config.c_ip_addr)) {
425 curl_easy_setopt(curl, CURLOPT_INTERFACE, config.c_ip_addr);
/* Start from an all-zero item so every pointer field is NULL before the handlers run. */
428 memset(&ri, 0, sizeof(struct rss_item));
430 XML_SetElementHandler(xp, rss_xml_start, rss_xml_end);
431 XML_SetCharacterDataHandler(xp, rss_xml_chardata);
432 XML_SetUserData(xp, &ri);
434 if (CtdlThreadCheckStop())
437 curl_easy_cleanup(curl);
441 if (CtdlThreadCheckStop())
444 res = curl_easy_perform(curl);
446 CtdlLogPrintf(CTDL_ALERT, "libcurl error %d: %s\n", res, errmsg);
449 if (CtdlThreadCheckStop())
/* If the end handler never saw the closing feed element, tell expat the document is finished (final flag 1, no further data). */
452 if (ri.done_parsing == 0) XML_Parse(xp, "", 0, 1);
456 curl_easy_cleanup(curl);
459 /* Free the feed item data structure */
460 if (ri.guid != NULL) free(ri.guid);
462 if (ri.title != NULL) free(ri.title);
464 if (ri.link != NULL) free(ri.link);
466 if (ri.author_or_creator != NULL) free(ri.author_or_creator);
467 ri.author_or_creator = NULL;
468 if (ri.description != NULL) free(ri.description);
469 ri.description = NULL;
470 if (ri.chardata_len > 0) {
/*
 * rssclient_scan_room()
 *
 * Per-room callback; rssclient_scan() passes it to ForEachRoom().
 *
 * qrbuf  Room being examined; its QRname is what gets recorded.
 * data   Not referenced on any line shown here.
 *
 * The room's netconfig file (path built by assoc_file_name() under
 * ctdl_netcfg_dir) is read line by line until end of file or until
 * CtdlThreadCheckStop() reports a stop.  For each line whose first
 * '|'-separated field is "rssclient" (case-insensitive), the second field is
 * taken as the feed URL.  The URL is looked up in rnclist with an exact
 * strcmp(); if it is not found a new entry is allocated and linked in front
 * of the current head.  The room name is then added to that entry's rooms
 * string.
 *
 * NOTE(review): several lines are missing from this listing, among them the
 * fopen() failure check, the list head update, the separator handling and the
 * file close.
 */
479 * Scan a room's netconfig to determine whether it is requesting any RSS feeds
481 void rssclient_scan_room(struct ctdlroom *qrbuf, void *data)
483 char filename[PATH_MAX];
488 struct rssnetcfg *rncptr = NULL;
489 struct rssnetcfg *use_this_rncptr = NULL;
493 assoc_file_name(filename, sizeof filename, qrbuf, ctdl_netcfg_dir);
495 if (CtdlThreadCheckStop())
498 /* Only do net processing for rooms that have netconfigs */
499 fp = fopen(filename, "r");
504 while (fgets(buf, sizeof buf, fp) != NULL && !CtdlThreadCheckStop()) {
/* NOTE(review): drops the last character unconditionally; a line without a trailing newline loses a real character, and an empty string would index buf[-1]. */
505 buf[strlen(buf)-1] = 0;
507 extract_token(instr, buf, 0, '|', sizeof instr);
508 if (!strcasecmp(instr, "rssclient")) {
510 use_this_rncptr = NULL;
512 extract_token(feedurl, buf, 1, '|', sizeof feedurl);
514 /* If any other rooms have requested the same feed, then we will just add this
515 * room to the target list for that client request.
517 for (rncptr=rnclist; rncptr!=NULL; rncptr=rncptr->next) {
518 if (!strcmp(rncptr->url, feedurl)) {
519 use_this_rncptr = rncptr;
523 /* Otherwise create a new client request */
524 if (use_this_rncptr == NULL) {
525 rncptr = (struct rssnetcfg *) malloc(sizeof(struct rssnetcfg));
526 if (rncptr != NULL) {
527 rncptr->next = rnclist;
528 safestrncpy(rncptr->url, feedurl, sizeof rncptr->url);
529 rncptr->rooms = NULL;
531 use_this_rncptr = rncptr;
535 /* Add the room name to the request */
536 if (use_this_rncptr != NULL) {
537 if (use_this_rncptr->rooms == NULL) {
/* NOTE(review): this assigns through rncptr while the surrounding tests use use_this_rncptr.  If the search loop above runs to completion rncptr is NULL, so the two only coincide for an entry allocated on this pass -- confirm, and consider use_this_rncptr here. */
538 rncptr->rooms = strdup(qrbuf->QRname);
/* Grow the existing list: both strings plus 5 bytes, presumably for a separator and the terminating NUL. */
541 len = strlen(use_this_rncptr->rooms) + strlen(qrbuf->QRname) + 5;
542 ptr = realloc(use_this_rncptr->rooms, len);
545 strcat(ptr, qrbuf->QRname);
546 use_this_rncptr->rooms = ptr;
/*
 * rssclient_scan()
 *
 * Thread entry point for one RSS client run; it is scheduled by the module
 * initialiser and reschedules itself at the end of each run.
 *
 * args  Not referenced on any line shown here.
 *
 * The thread first installs its own CitContext and thread-specific data.  A
 * static flag guards against overlapping runs: when it is set the function
 * returns NULL immediately.  Otherwise every room is visited with
 * rssclient_scan_room() to build rnclist, and the list is then consumed one
 * entry at a time, fetching each URL with rss_do_fetching() and freeing the
 * entry's rooms string.  When the run ends the finish time is stored in
 * last_run and, unless a stop was requested, the next run is scheduled for
 * last_run + config.c_net_freq.
 *
 * NOTE(review): the lines that set and clear doing_rssclient, assign rptr,
 * free the list entry and return at the end are not shown in this listing.
 */
559 * Scan for rooms that have RSS client requests configured
561 void *rssclient_scan(void *args) {
562 static time_t last_run = 0L;
563 static int doing_rssclient = 0;
564 struct rssnetcfg *rptr = NULL;
565 struct CitContext rssclientCC;
567 /* Give this thread its own private CitContext */
568 CtdlFillSystemContext(&rssclientCC, "rssclient");
569 citthread_setspecific(MyConKey, (void *)&rssclientCC );
571 CtdlThreadAllocTSD();
574 * This is a simple concurrency check to make sure only one rssclient run
575 * is done at a time. We could do this with a mutex, but since we
576 * don't really require extremely fine granularity here, we'll do it
577 * with a static variable instead.
579 if (doing_rssclient) return NULL;
582 CtdlLogPrintf(CTDL_DEBUG, "rssclient started\n");
583 ForEachRoom(rssclient_scan_room, NULL);
/* Consume the request list from the head.  NOTE(review): if the stop check ends this loop early, the unprocessed entries stay on rnclist. */
585 while (rnclist != NULL && !CtdlThreadCheckStop()) {
586 rss_do_fetching(rnclist->url, rnclist->rooms);
588 rnclist = rnclist->next;
589 if (rptr->rooms != NULL) free(rptr->rooms);
593 CtdlLogPrintf(CTDL_DEBUG, "rssclient ended\n");
594 last_run = time(NULL);
/* Schedule the next run relative to the time this one finished. */
596 if (!CtdlThreadCheckStop())
597 CtdlThreadSchedule ("RSS Client", CTDLTHREAD_BIGSTACK, rssclient_scan, NULL, last_run + config.c_net_freq);
599 CtdlLogPrintf(CTDL_DEBUG, "rssclient: Task STOPPED.\n");
/*
 * Module initialiser for the RSS client.
 *
 * Logs the libcurl version string at CTDL_INFO and schedules the first
 * rssclient_scan() run as the "RSS Client" thread with a start time of 0.
 *
 * NOTE(review): any condition guarding these calls, and the return statement
 * the last comment refers to, are not shown in this listing.
 */
604 CTDL_MODULE_INIT(rssclient)
608 CtdlLogPrintf(CTDL_INFO, "%s\n", curl_version());
609 CtdlThreadSchedule ("RSS Client", CTDLTHREAD_BIGSTACK, rssclient_scan, NULL, 0);
611 /* return our Subversion id for the Log */