250e1ecdd436bef27e2f40f925a552f9943beede
[citadel.git] / citadel / modules / rssclient / serv_rssclient.c
1 /*
2  * Bring external RSS and/or Atom feeds into rooms.  This module implements a
3  * very loose parser that scrapes both kinds of feeds and is not picky about
4  * the standards compliance of the source data.
5  *
6  * Copyright (c) 2007-2018 by the citadel.org team
7  *
8  * This program is open source software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 3.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  */
16
17 #include <stdlib.h>
18 #include <unistd.h>
19 #include <stdio.h>
20
21 #if TIME_WITH_SYS_TIME
22 # include <sys/time.h>
23 # include <time.h>
24 #else
25 # if HAVE_SYS_TIME_H
26 #include <sys/time.h>
27 # else
28 #include <time.h>
29 # endif
30 #endif
31
32 #include <ctype.h>
33 #include <string.h>
34 #include <errno.h>
35 #include <sys/types.h>
36 #include <sys/stat.h>
37 #include <expat.h>
38 #include <curl/curl.h>
39 #include <libcitadel.h>
40 #include "citadel.h"
41 #include "server.h"
42 #include "citserver.h"
43 #include "support.h"
44 #include "config.h"
45 #include "threads.h"
46 #include "ctdl_module.h"
47 #include "msgbase.h"
48 #include "parsedate.h"
49 #include "database.h"
50 #include "citadel_dirs.h"
51 #include "md5.h"
52 #include "context.h"
53 #include "internet_addressing.h"
54
// One destination room for a feed.  Entries form a singly linked list that
// hangs off a struct rssurl.
struct rssroom {
	struct rssroom *next;		// next room receiving the same feed, or NULL
	char *room;			// heap-allocated room name
};
59
// One feed that needs to be fetched, together with every room that wants it.
// Entries form the singly linked to-do list headed by rsstodo.
struct rssurl {
	struct rssurl *next;		// next feed in the to-do list, or NULL
	char *url;			// heap-allocated feed URL
	struct rssroom *rooms;		// rooms the feed's items are saved to
};
65
66 struct rssparser {
67         StrBuf *CData;
68         struct CtdlMessage *msg;
69         char *link;
70         char *description;
71         char *item_id;
72         struct rssroom *rooms;
73 };
74
75 time_t last_run = 0L;
76 struct CitContext rss_CC;
77 struct rssurl *rsstodo = NULL;
78
79
80 // This handler is called whenever an XML tag opens.
81 //
82 void rss_start_element(void *data, const char *el, const char **attribute)
83 {
84         struct rssparser *r = (struct rssparser *)data;
85         int i;
86
87         if (
88                 (!strcasecmp(el, "entry"))
89                 || (!strcasecmp(el, "item"))
90         ) {
91                 // this is the start of a new item(rss) or entry(atom)
92                 if (r->msg != NULL) {
93                         CM_Free(r->msg);
94                         r->msg = NULL;
95                 }
96                 r->msg = malloc(sizeof(struct CtdlMessage));
97                 memset(r->msg, 0, sizeof(struct CtdlMessage));
98                 r->msg->cm_magic = CTDLMESSAGE_MAGIC;
99                 r->msg->cm_anon_type = MES_NORMAL;
100                 r->msg->cm_format_type = FMT_RFC822;
101         }
102
103         else if (!strcasecmp(el, "link")) {                     // atom feeds have the link as an attribute
104                 for(i = 0; attribute[i]; i += 2) {
105                         if (!strcasecmp(attribute[i], "href")) {
106                                 if (r->link != NULL) {
107                                         free(r->link);
108                                         r->link = NULL;
109                                 }
110                                 r->link = strdup(attribute[i+1]);
111                                 striplt(r->link);
112                         }
113                 }
114         }
115 }
116
117
118 // This handler is called whenever an XML tag closes.
119 //
120 void rss_end_element(void *data, const char *el)
121 {
122         struct rssparser *r = (struct rssparser *)data;
123
124         if (                                                    // end of a new item(rss) or entry(atom)
125                 (!strcasecmp(el, "entry"))
126                 || (!strcasecmp(el, "item"))
127         ) {
128
129                 if (r->msg != NULL) {                           // Save the message to the rooms
130
131                         // use the link as an item id if nothing else is available
132                         if ((r->item_id == NULL) && (r->link != NULL)) {
133                                 r->item_id = strdup(r->link);
134                         }
135
136                         // check the use table
137                         StrBuf *u = NewStrBuf();
138                         StrBufAppendPrintf(u, "rss/%s", r->item_id);
139                         int already_seen = CheckIfAlreadySeen(u);
140                         FreeStrBuf(&u);
141
142                         if (already_seen == 0) {
143
144                                 // Compose the message text
145                                 StrBuf *TheMessage = NewStrBuf();
146                                 StrBufAppendPrintf(TheMessage,
147                                         "Content-type: text/html\n\n"
148                                         "\n\n"
149                                         "<html><head></head><body>"
150                                 );
151                 
152                                 if (r->description != NULL) {
153                                         StrBufAppendPrintf(TheMessage, "%s<br><br>\r\n", r->description);
154                                         free(r->description);
155                                         r->description = NULL;
156                                 }
157                 
158                                 if (r->link != NULL) {
159                                         StrBufAppendPrintf(TheMessage, "<a href=\"%s\">%s</a>\r\n", r->link, r->link);
160                                         free(r->link);
161                                         r->link = NULL;
162                                 }
163         
164                                 StrBufAppendPrintf(TheMessage, "</body></html>\r\n");
165                                 CM_SetField(r->msg, eMesageText, ChrPtr(TheMessage), StrLength(TheMessage));
166                                 FreeStrBuf(&TheMessage);
167         
168                                 if (CM_IsEmpty(r->msg, eAuthor)) {
169                                         CM_SetField(r->msg, eAuthor, HKEY("rss"));
170                                 }
171         
172                                 if (CM_IsEmpty(r->msg, eTimestamp)) {
173                                         CM_SetFieldLONG(r->msg, eTimestamp, time(NULL));
174                                 }
175         
176                                 // Save it to the room(s)
177                                 struct rssroom *rr = NULL;
178                                 long msgnum = (-1);
179                                 for (rr=r->rooms; rr!=NULL; rr=rr->next) {
180                                         if (rr == r->rooms) {
181                                                 msgnum = CtdlSubmitMsg(r->msg, NULL, rr->room, 0);
182                                         }
183                                         else {
184                                                 CtdlSaveMsgPointerInRoom(rr->room, msgnum, 0, NULL);
185                                         }
186                                         syslog(LOG_DEBUG, "rssclient: saved message %ld to %s", msgnum, rr->room);
187                                 }
188                         }
189                         else {
190                                 syslog(LOG_DEBUG, "rssclient: already seen %s", r->item_id);
191                         }
192         
193                         CM_Free(r->msg);
194                         r->msg = NULL;
195                 }
196
197                 if (r->item_id != NULL) {
198                         free(r->item_id);
199                         r->item_id = NULL;
200                 }
201         }
202
203         else if (!strcasecmp(el, "title")) {                    // item subject (rss and atom)
204                 if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eMsgSubject))) {
205                         CM_SetField(r->msg, eMsgSubject, ChrPtr(r->CData), StrLength(r->CData));
206                         striplt(r->msg->cm_fields[eMsgSubject]);
207                 }
208         }
209
210         else if (!strcasecmp(el, "creator")) {                  // <creator> can be used if <author> is not present
211                 if ((r->msg != NULL) && (CM_IsEmpty(r->msg, eAuthor))) {
212                         CM_SetField(r->msg, eAuthor, ChrPtr(r->CData), StrLength(r->CData));
213                         striplt(r->msg->cm_fields[eAuthor]);
214                 }
215         }
216
217         else if (!strcasecmp(el, "author")) {                   // <author> supercedes <creator> if both are present
218                 if (r->msg != NULL) {
219                         CM_SetField(r->msg, eAuthor, ChrPtr(r->CData), StrLength(r->CData));    // CM_SetField will free() the previous value
220                         striplt(r->msg->cm_fields[eAuthor]);
221                 }
222         }
223
224         else if (!strcasecmp(el, "pubdate")) {                  // date/time stamp (rss) Sat, 25 Feb 2017 14:28:01 EST
225                 if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
226                         CM_SetFieldLONG(r->msg, eTimestamp, parsedate(ChrPtr(r->CData)));
227                 }
228         }
229
230         else if (!strcasecmp(el, "updated")) {                  // date/time stamp (atom) 2003-12-13T18:30:02Z
231                 if ((r->msg)&&(r->msg->cm_fields[eTimestamp]==NULL)) {
232                         struct tm t;
233                         char zulu;
234                         memset(&t, 0, sizeof t);
235                         sscanf(ChrPtr(r->CData), "%d-%d-%dT%d:%d:%d%c", &t.tm_year, &t.tm_mon, &t.tm_mday, &t.tm_hour, &t.tm_min, &t.tm_sec, &zulu);
236                         t.tm_year -= 1900;
237                         t.tm_mon -= 1;
238                         CM_SetFieldLONG(r->msg, eTimestamp, mktime(&t));
239                 }
240         }
241
242         else if (!strcasecmp(el, "link")) {                     // link to story (rss)
243                 if (r->link != NULL) {
244                         free(r->link);
245                         r->link = NULL;
246                 }
247                 r->link = strdup(ChrPtr(r->CData));
248                 striplt(r->link);
249         }
250
251         else if (
252                 (!strcasecmp(el, "guid"))                       // unique item id (rss)
253                 || (!strcasecmp(el, "id"))                      // unique item id (atom)
254         ) {
255                 if (r->item_id != NULL) {
256                         free(r->item_id);
257                         r->item_id = NULL;
258                 }
259                 r->item_id = strdup(ChrPtr(r->CData));
260                 striplt(r->item_id);
261         }
262
263         else if (
264                 (!strcasecmp(el, "description"))                // message text (rss)
265                 || (!strcasecmp(el, "summary"))                 // message text (atom)
266                 || (!strcasecmp(el, "content"))                 // message text (atom)
267         ) {
268                 if (r->description != NULL) {
269                         free(r->description);
270                         r->description = NULL;
271                 }
272                 r->description = strdup(ChrPtr(r->CData));
273                 striplt(r->description);
274         }
275
276         if (r->CData != NULL) {
277                 FreeStrBuf(&r->CData);
278                 r->CData = NULL;
279         }
280 }
281
282
283 // This handler is called whenever data appears between opening and closing tags.
284 //
285 void rss_handle_data(void *data, const char *content, int length)
286 {
287         struct rssparser *r = (struct rssparser *)data;
288
289         if (r->CData == NULL) {
290                 r->CData = NewStrBuf();
291         }
292
293         StrBufAppendBufPlain(r->CData, content, length, 0);
294 }
295
296
297 // Feed has been downloaded, now parse it.
298 //
299 void rss_parse_feed(StrBuf *Feed, struct rssroom *rooms)
300 {
301         struct rssparser r;
302
303         memset(&r, 0, sizeof r);
304         r.rooms = rooms;
305         XML_Parser p = XML_ParserCreate("UTF-8");
306         XML_SetElementHandler(p, rss_start_element, rss_end_element);
307         XML_SetCharacterDataHandler(p, rss_handle_data);
308         XML_SetUserData(p, (void *)&r);
309         XML_Parse(p, ChrPtr(Feed), StrLength(Feed), XML_TRUE);
310         XML_ParserFree(p);
311 }
312
313
314 // Add a feed/room pair into the todo list
315 //
316 void rssclient_push_todo(char *rssurl, char *roomname)
317 {
318         struct rssurl *r = NULL;
319         struct rssurl *thisone = NULL;
320         struct rssroom *newroom = NULL;
321
322         syslog(LOG_DEBUG, "rssclient: will fetch %s to %s", rssurl, roomname);
323
324         for (r=rsstodo; r!=NULL; r=r->next) {
325                 if (!strcasecmp(r->url, rssurl)) {
326                         thisone = r;
327                 }
328         }
329         if (thisone == NULL) {
330                 thisone = malloc(sizeof(struct rssurl));
331                 thisone->url = strdup(rssurl);
332                 thisone->rooms = NULL;
333                 thisone->next = rsstodo;
334                 rsstodo = thisone;
335         }
336
337         newroom = malloc(sizeof(struct rssroom));
338         newroom->room = strdup(roomname);
339         newroom->next = thisone->rooms;
340         thisone->rooms = newroom;
341 }
342
343
344 // pull one feed (possibly multiple rooms)
345 //
346 void rss_pull_one_feed(struct rssurl *url)
347 {
348         CURL *curl;
349         CURLcode res;
350         StrBuf *Downloaded = NULL;
351
352         syslog(LOG_DEBUG, "rssclient: fetching %s", url->url);
353
354         curl = curl_easy_init();
355         if (!curl) {
356                 return;
357         }
358
359         Downloaded = NewStrBuf();
360
361         curl_easy_setopt(curl, CURLOPT_URL, url->url);
362         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
363         curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
364         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);                     // Follow redirects
365         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, CurlFillStrBuf_callback); // What to do with downloaded data
366         curl_easy_setopt(curl, CURLOPT_WRITEDATA, Downloaded);                  // Give it our StrBuf to work with
367         curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);                           // Time out after 20 seconds
368         res = curl_easy_perform(curl);                                          // Perform the request
369         if (res != CURLE_OK) {
370                 syslog(LOG_WARNING, "rssclient: failed to load feed: %s", curl_easy_strerror(res));
371         }
372         curl_easy_cleanup(curl);
373
374         rss_parse_feed(Downloaded, url->rooms);                                 // parse the feed
375         FreeStrBuf(&Downloaded);                                                // free the downloaded feed data
376 }
377
378
379 // We have a list, now download the feeds
380 //
381 void rss_pull_feeds(void)
382 {
383         struct rssurl *r;
384         struct rssroom *rr;
385
386         while (rsstodo != NULL) {
387                 rss_pull_one_feed(rsstodo);
388                 r = rsstodo;
389                 rsstodo = rsstodo->next;
390                 while (r->rooms != NULL) {
391                         rr = r->rooms;
392                         r->rooms = r->rooms->next;
393                         free(rr->room);
394                         free(rr);
395                 }
396                 free(r->url);
397                 free(r);
398         }
399 }
400
401
402 // Scan a room's netconfig looking for RSS feed parsing requests
403 //
404 void rssclient_scan_room(struct ctdlroom *qrbuf, void *data)
405 {
406         char *serialized_config = NULL;
407         int num_configs = 0;
408         char cfgline[SIZ];
409         int i = 0;
410
411         serialized_config = LoadRoomNetConfigFile(qrbuf->QRnumber);
412         if (!serialized_config) {
413                 return;
414         }
415
416         num_configs = num_tokens(serialized_config, '\n');
417         for (i=0; i<num_configs; ++i) {
418                 extract_token(cfgline, serialized_config, i, '\n', sizeof cfgline);
419                 if (!strncasecmp(cfgline, HKEY("rssclient|"))) {
420                         strcpy(cfgline, &cfgline[10]);
421                         char *vbar = strchr(cfgline, '|');
422                         if (vbar != NULL) {
423                                 *vbar = 0;
424                         }
425                         rssclient_push_todo(cfgline, qrbuf->QRname);
426                 }
427         }
428
429         free(serialized_config);
430 }
431
432
433 /*
434  * Scan for rooms that have RSS client requests configured
435  */
436 void rssclient_scan(void) {
437         time_t now = time(NULL);
438
439         /* Run no more than once every 15 minutes. */
440         if ((now - last_run) < 900) {
441                 syslog(LOG_DEBUG,
442                         "rssclient: polling interval not yet reached; last run was %ldm%lds ago",
443                         ((now - last_run) / 60),
444                         ((now - last_run) % 60)
445                 );
446                 return;
447         }
448
449         become_session(&rss_CC);
450         syslog(LOG_DEBUG, "rssclient: started");
451         CtdlForEachRoom(rssclient_scan_room, NULL);
452         rss_pull_feeds();
453         syslog(LOG_DEBUG, "rssclient: ended");
454         last_run = time(NULL);
455         return;
456 }
457
458
459 CTDL_MODULE_INIT(rssclient)
460 {
461         if (!threading)
462         {
463                 syslog(LOG_INFO, "rssclient: using %s", curl_version());
464                 CtdlRegisterSessionHook(rssclient_scan, EVT_TIMER, PRIO_AGGR + 300);
465         }
466         else
467         {
468                 CtdlFillSystemContext(&rss_CC, "rssclient");
469         }
470         return "rssclient";
471 }
472