move stuff out into own files from here...
[citadel.git] / citadel / modules / rssclient / serv_rssclient.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2010 by the citadel.org team
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  */
20
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <ctype.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <expat.h>
42 #include <curl/curl.h>
43 #include <libcitadel.h>
44 #include "citadel.h"
45 #include "server.h"
46 #include "citserver.h"
47 #include "support.h"
48 #include "config.h"
49 #include "threads.h"
50 #include "ctdl_module.h"
51 #include "msgbase.h"
52 #include "parsedate.h"
53 #include "database.h"
54 #include "citadel_dirs.h"
55 #include "md5.h"
56 #include "context.h"
57 #include "event_client.h"
58 #include "rss_atom_parser.h"
59
60
61 struct rssnetcfg *rnclist = NULL;
62 void AppendLink(StrBuf *Message, StrBuf *link, StrBuf *LinkTitle, const char *Title)
63 {
64         if (StrLength(link) > 0)
65         {
66                 StrBufAppendBufPlain(Message, HKEY("<a href=\""), 0);
67                 StrBufAppendBuf(Message, link, 0);
68                 StrBufAppendBufPlain(Message, HKEY("\">"), 0);
69                 if (StrLength(LinkTitle) > 0)
70                         StrBufAppendBuf(Message, LinkTitle, 0);
71                 else if ((Title != NULL) && !IsEmptyStr(Title))
72                         StrBufAppendBufPlain(Message, Title, -1, 0);
73                 else
74                         StrBufAppendBuf(Message, link, 0);
75                 StrBufAppendBufPlain(Message, HKEY("</a><br>\n"), 0);
76         }
77 }
78 /*
79  * Commit a fetched and parsed RSS item to disk
80  */
81 void rss_save_item(rss_item *ri)
82 {
83
84         struct MD5Context md5context;
85         u_char rawdigest[MD5_DIGEST_LEN];
86         int i;
87         char utmsgid[SIZ];
88         struct cdbdata *cdbut;
89         struct UseTable ut;
90         struct CtdlMessage *msg;
91         struct recptypes *recp = NULL;
92         int msglen = 0;
93         StrBuf *Message;
94
95         recp = (struct recptypes *) malloc(sizeof(struct recptypes));
96         if (recp == NULL) return;
97         memset(recp, 0, sizeof(struct recptypes));
98         memset(&ut, 0, sizeof(struct UseTable));
99         recp->recp_room = strdup(ri->roomlist);
100         recp->num_room = num_tokens(ri->roomlist, '|');
101         recp->recptypes_magic = RECPTYPES_MAGIC;
102    
103         /* Construct a GUID to use in the S_USETABLE table.
104          * If one is not present in the item itself, make one up.
105          */
106         if (ri->guid != NULL) {
107                 StrBufSpaceToBlank(ri->guid);
108                 StrBufTrim(ri->guid);
109                 snprintf(utmsgid, sizeof utmsgid, "rss/%s", ChrPtr(ri->guid));
110         }
111         else {
112                 MD5Init(&md5context);
113                 if (ri->title != NULL) {
114                         MD5Update(&md5context, (const unsigned char*)ChrPtr(ri->title), StrLength(ri->title));
115                 }
116                 if (ri->link != NULL) {
117                         MD5Update(&md5context, (const unsigned char*)ChrPtr(ri->link), StrLength(ri->link));
118                 }
119                 MD5Final(rawdigest, &md5context);
120                 for (i=0; i<MD5_DIGEST_LEN; i++) {
121                         sprintf(&utmsgid[i*2], "%02X", (unsigned char) (rawdigest[i] & 0xff));
122                         utmsgid[i*2] = tolower(utmsgid[i*2]);
123                         utmsgid[(i*2)+1] = tolower(utmsgid[(i*2)+1]);
124                 }
125                 strcat(utmsgid, "_rss2ctdl");
126         }
127
128         /* Find out if we've already seen this item */
129
130         cdbut = cdb_fetch(CDB_USETABLE, utmsgid, strlen(utmsgid));
131 #ifndef DEBUG_RSS
132         if (cdbut != NULL) {
133                 /* Item has already been seen */
134                 CtdlLogPrintf(CTDL_DEBUG, "%s has already been seen\n", utmsgid);
135                 cdb_free(cdbut);
136
137                 /* rewrite the record anyway, to update the timestamp */
138                 strcpy(ut.ut_msgid, utmsgid);
139                 ut.ut_timestamp = time(NULL);
140                 cdb_store(CDB_USETABLE, utmsgid, strlen(utmsgid), &ut, sizeof(struct UseTable) );
141         }
142         else
143 #endif
144 {
145                 /* Item has not been seen, so save it. */
146                 CtdlLogPrintf(CTDL_DEBUG, "RSS: saving item...\n");
147                 if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
148                 StrBufSpaceToBlank(ri->description);
149                 msg = malloc(sizeof(struct CtdlMessage));
150                 memset(msg, 0, sizeof(struct CtdlMessage));
151                 msg->cm_magic = CTDLMESSAGE_MAGIC;
152                 msg->cm_anon_type = MES_NORMAL;
153                 msg->cm_format_type = FMT_RFC822;
154
155                 if (ri->guid != NULL) {
156                         msg->cm_fields['E'] = strdup(ChrPtr(ri->guid));
157                 }
158
159                 if (ri->author_or_creator != NULL) {
160                         char *From;
161                         StrBuf *Encoded = NULL;
162                         int FromAt;
163                         
164                         From = html_to_ascii(ChrPtr(ri->author_or_creator),
165                                              StrLength(ri->author_or_creator), 
166                                              512, 0);
167                         StrBufPlain(ri->author_or_creator, From, -1);
168                         StrBufTrim(ri->author_or_creator);
169                         free(From);
170
171                         FromAt = strchr(ChrPtr(ri->author_or_creator), '@') != NULL;
172                         if (!FromAt && StrLength (ri->author_email) > 0)
173                         {
174                                 StrBufRFC2047encode(&Encoded, ri->author_or_creator);
175                                 msg->cm_fields['A'] = SmashStrBuf(&Encoded);
176                                 msg->cm_fields['P'] = SmashStrBuf(&ri->author_email);
177                         }
178                         else
179                         {
180                                 if (FromAt)
181                                         msg->cm_fields['P'] = SmashStrBuf(&ri->author_or_creator);
182                                 else 
183                                 {
184                                         StrBufRFC2047encode(&Encoded, ri->author_or_creator);
185                                         msg->cm_fields['A'] = SmashStrBuf(&Encoded);
186                                         msg->cm_fields['P'] = strdup("rss@localhost");
187                                 }
188                         }
189                 }
190                 else {
191                         msg->cm_fields['A'] = strdup("rss");
192                 }
193
194                 msg->cm_fields['N'] = strdup(NODENAME);
195                 if (ri->title != NULL) {
196                         long len;
197                         char *Sbj;
198                         StrBuf *Encoded, *QPEncoded;
199
200                         QPEncoded = NULL;
201                         StrBufSpaceToBlank(ri->title);
202                         len = StrLength(ri->title);
203                         Sbj = html_to_ascii(ChrPtr(ri->title), len, 512, 0);
204                         len = strlen(Sbj);
205                         if (Sbj[len - 1] == '\n')
206                         {
207                                 len --;
208                                 Sbj[len] = '\0';
209                         }
210                         Encoded = NewStrBufPlain(Sbj, len);
211                         free(Sbj);
212
213                         StrBufTrim(Encoded);
214                         StrBufRFC2047encode(&QPEncoded, Encoded);
215
216                         msg->cm_fields['U'] = SmashStrBuf(&QPEncoded);
217                         FreeStrBuf(&Encoded);
218                 }
219                 msg->cm_fields['T'] = malloc(64);
220                 snprintf(msg->cm_fields['T'], 64, "%ld", ri->pubdate);
221                 if (ri->channel_title != NULL) {
222                         if (StrLength(ri->channel_title) > 0) {
223                                 msg->cm_fields['O'] = strdup(ChrPtr(ri->channel_title));
224                         }
225                 }
226                 if (ri->link == NULL) 
227                         ri->link = NewStrBufPlain(HKEY(""));
228                 // TODO: reenable me    ExpandShortUrls(ri->description);
229                 msglen += 1024 + StrLength(ri->link) + StrLength(ri->description) ;
230
231                 Message = NewStrBufPlain(NULL, StrLength(ri->description));
232
233                 StrBufPlain(Message, HKEY(
234                          "Content-type: text/html; charset=\"UTF-8\"\r\n\r\n"
235                          "<html><body>\n"));
236
237                 StrBufAppendBuf(Message, ri->description, 0);
238                 StrBufAppendBufPlain(Message, HKEY("<br><br>\n"), 0);
239
240                 AppendLink(Message, ri->link, ri->linkTitle, NULL);
241                 AppendLink(Message, ri->reLink, ri->reLinkTitle, "Reply to this");
242                 StrBufAppendBufPlain(Message, HKEY("</body></html>\n"), 0);
243
244                 msg->cm_fields['M'] = SmashStrBuf(&Message);
245
246                 CtdlSubmitMsg(msg, recp, NULL, 0);
247                 CtdlFreeMessage(msg);
248
249                 /* write the uidl to the use table so we don't store this item again */
250                 strcpy(ut.ut_msgid, utmsgid);
251                 ut.ut_timestamp = time(NULL);
252                 cdb_store(CDB_USETABLE, utmsgid, strlen(utmsgid), &ut, sizeof(struct UseTable) );
253         }
254         free_recipients(recp);
255 }
256
257
258
259
260
261 /*
262  * Begin a feed parse
263  */
264 void rss_do_fetching(rssnetcfg *Cfg) {
265         rsscollection *rssc;
266         rss_item *ri;
267                 
268         time_t now;
269
270         CURL *chnd;
271         AsyncIO *IO;
272
273         now = time(NULL);
274
275         if ((Cfg->next_poll != 0) && (now < Cfg->next_poll))
276                 return;
277
278
279         ri = (rss_item*) malloc(sizeof(rss_item));
280         rssc = (rsscollection*) malloc(sizeof(rsscollection));
281         memset(ri, 0, sizeof(rss_item));
282         memset(rssc, 0, sizeof(rsscollection));
283         rssc->Item = ri;
284         rssc->Cfg = Cfg;
285         IO = &rssc->IO;
286         IO->CitContext = CloneContext(CC);
287         IO->Data = rssc;
288         ri->roomlist = Cfg->rooms;
289
290
291         CtdlLogPrintf(CTDL_DEBUG, "Fetching RSS feed <%s>\n", ChrPtr(Cfg->Url));
292         ParseURL(&IO->ConnectMe, Cfg->Url, 80);
293
294         if (! evcurl_init(IO, 
295 //                        Ctx, 
296                           NULL,
297                           "Citadel RSS Client",
298                           ParseRSSReply))
299         {
300                 CtdlLogPrintf(CTDL_ALERT, "Unable to initialize libcurl.\n");
301 //              goto abort;
302         }
303         chnd = IO->HttpReq.chnd;
304
305         evcurl_handle_start(IO);
306 }
307
308
309
310
311 /*
312  * Scan a room's netconfig to determine whether it is requesting any RSS feeds
313  */
314 void rssclient_scan_room(struct ctdlroom *qrbuf, void *data)
315 {
316         char filename[PATH_MAX];
317         char buf[1024];
318         char instr[32];
319         FILE *fp;
320         char feedurl[256];
321         rssnetcfg *rncptr = NULL;
322         rssnetcfg *use_this_rncptr = NULL;
323         int len = 0;
324         char *ptr = NULL;
325
326         assoc_file_name(filename, sizeof filename, qrbuf, ctdl_netcfg_dir);
327
328         if (CtdlThreadCheckStop())
329                 return;
330                 
331         /* Only do net processing for rooms that have netconfigs */
332         fp = fopen(filename, "r");
333         if (fp == NULL) {
334                 return;
335         }
336
337         while (fgets(buf, sizeof buf, fp) != NULL && !CtdlThreadCheckStop()) {
338                 buf[strlen(buf)-1] = 0;
339
340                 extract_token(instr, buf, 0, '|', sizeof instr);
341                 if (!strcasecmp(instr, "rssclient")) {
342
343                         use_this_rncptr = NULL;
344
345                         extract_token(feedurl, buf, 1, '|', sizeof feedurl);
346
347                         /* If any other rooms have requested the same feed, then we will just add this
348                          * room to the target list for that client request.
349                          */
350                         for (rncptr=rnclist; rncptr!=NULL; rncptr=rncptr->next) {
351                                 if (!strcmp(ChrPtr(rncptr->Url), feedurl)) {
352                                         use_this_rncptr = rncptr;
353                                 }
354                         }
355
356                         /* Otherwise create a new client request */
357                         if (use_this_rncptr == NULL) {
358                                 rncptr = (rssnetcfg *) malloc(sizeof(rssnetcfg));
359                                 memset(rncptr, 0, sizeof(rssnetcfg));
360                                 rncptr->ItemType = RSS_UNSET;
361                                 if (rncptr != NULL) {
362                                         rncptr->next = rnclist;
363                                         rncptr->Url = NewStrBufPlain(feedurl, -1);
364                                         rncptr->rooms = NULL;
365                                         rnclist = rncptr;
366                                         use_this_rncptr = rncptr;
367                                 }
368                         }
369
370                         /* Add the room name to the request */
371                         if (use_this_rncptr != NULL) {
372                                 if (use_this_rncptr->rooms == NULL) {
373                                         rncptr->rooms = strdup(qrbuf->QRname);
374                                 }
375                                 else {
376                                         len = strlen(use_this_rncptr->rooms) + strlen(qrbuf->QRname) + 5;
377                                         ptr = realloc(use_this_rncptr->rooms, len);
378                                         if (ptr != NULL) {
379                                                 strcat(ptr, "|");
380                                                 strcat(ptr, qrbuf->QRname);
381                                                 use_this_rncptr->rooms = ptr;
382                                         }
383                                 }
384                         }
385                 }
386
387         }
388
389         fclose(fp);
390
391 }
392
393 /*
394  * Scan for rooms that have RSS client requests configured
395  */
396 void rssclient_scan(void) {
397         static time_t last_run = 0L;
398         static int doing_rssclient = 0;
399         rssnetcfg *rptr = NULL;
400
401         /*
402          * This is a simple concurrency check to make sure only one rssclient run
403          * is done at a time.  We could do this with a mutex, but since we
404          * don't really require extremely fine granularity here, we'll do it
405          * with a static variable instead.
406          */
407         if (doing_rssclient) return;
408         doing_rssclient = 1;
409
410         CtdlLogPrintf(CTDL_DEBUG, "rssclient started\n");
411         CtdlForEachRoom(rssclient_scan_room, NULL);
412
413         while (rnclist != NULL && !CtdlThreadCheckStop()) {
414                 rss_do_fetching(rnclist);
415                 rptr = rnclist;
416                 rnclist = rnclist->next;
417                 if (rptr->rooms != NULL) free(rptr->rooms);
418                 free(rptr);
419         }
420
421         CtdlLogPrintf(CTDL_DEBUG, "rssclient ended\n");
422         last_run = time(NULL);
423         doing_rssclient = 0;
424         return;
425 }
426
427
428 CTDL_MODULE_INIT(rssclient)
429 {
430         if (threading)
431         {
432                 CtdlLogPrintf(CTDL_INFO, "%s\n", curl_version());
433                 CtdlRegisterSessionHook(rssclient_scan, EVT_TIMER);
434         }
435         return "rssclient";
436 }