4a2c4f6471e5614cb09631271484eaa81bc7dee5
[citadel.git] / citadel / modules / rssclient / serv_rssclient.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2017 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include <stdlib.h>
16 #include <unistd.h>
17 #include <stdio.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #include <sys/time.h>
25 # else
26 #include <time.h>
27 # endif
28 #endif
29
30 #include <ctype.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <expat.h>
36 #include <curl/curl.h>
37 #include <libcitadel.h>
38 #include "citadel.h"
39 #include "server.h"
40 #include "citserver.h"
41 #include "support.h"
42 #include "config.h"
43 #include "threads.h"
44 #include "ctdl_module.h"
45 #include "msgbase.h"
46 #include "parsedate.h"
47 #include "database.h"
48 #include "citadel_dirs.h"
49 #include "md5.h"
50 #include "context.h"
51
// One destination room for a feed; nodes form a singly linked list.
struct rssroom {
	struct rssroom *next;		// next room in the list, or NULL at the end
	char *room;			// heap-allocated room name (freed by rss_pull_feeds)
};
56
// One feed URL on the todo list, together with every room that subscribes to it.
struct rssurl {
	struct rssurl *next;		// next feed in the todo list, or NULL at the end
	char *url;			// heap-allocated feed URL
	struct rssroom *rooms;		// list of rooms that want this feed
};
62
63 struct rssparser {
64         StrBuf *CData;
65         struct CtdlMessage *msg;
66         char *link;
67         char *description;
68         struct rssroom *rooms;
69 };
70
71 time_t last_run = 0L;
72 struct CitContext rss_CC;
73 struct rssurl *rsstodo = NULL;
74
75
76 // This is what RSS probably looks like
77 //
78 //      <item>
79 //              <title><![CDATA[Felicity flexes action chops]]></title>
80 //              <link>http://video.foxnews.com/v/5336254459001/</link>
81 //              <author>foxnewsonline@foxnews.com (Fox News Online)</author>
82 //              <description />
83 //              <pubDate>Sat, 25 Feb 2017 14:28:01 EST</pubDate>
84 //      </item>
85
86
87 // This handler is called whenever an XML tag opens.
88 //
89 void rss_start_element(void *data, const char *el, const char **attribute)
90 {
91         struct rssparser *r = (struct rssparser *)data;
92         int i;
93
94         if (
95                 (!strcasecmp(el, "entry"))
96                 || (!strcasecmp(el, "item"))
97         ) {
98                 // this is the start of a new item(rss) or entry(atom)
99                 if (r->msg != NULL) {
100                         CM_Free(r->msg);
101                         r->msg = NULL;
102                 }
103                 r->msg = malloc(sizeof(struct CtdlMessage));
104                 memset(r->msg, 0, sizeof(struct CtdlMessage));
105                 r->msg->cm_magic = CTDLMESSAGE_MAGIC;
106                 r->msg->cm_anon_type = MES_NORMAL;
107                 r->msg->cm_format_type = FMT_RFC822;
108         }
109
110         else if (!strcasecmp(el, "link")) {                     // atom feeds have the link as an attribute
111                 for(i = 0; attribute[i]; i += 2) {
112                         if (!strcasecmp(attribute[i], "href")) {
113                                 if (r->link != NULL) {
114                                         free(r->link);
115                                         r->link = NULL;
116                                 }
117                                 r->link = strdup(attribute[i+1]);
118                                 striplt(r->link);
119                         }
120                 }
121         }
122 }
123
124
125 // This handler is called whenever an XML tag closes.
126 //
127 void rss_end_element(void *data, const char *el)
128 {
129         struct rssparser *r = (struct rssparser *)data;
130
131         if (                                                    // end of a new item(rss) or entry(atom)
132                 (!strcasecmp(el, "entry"))
133                 || (!strcasecmp(el, "item"))
134         ) {
135
136                 if (r->msg != NULL) {                           // Save the message to the rooms
137
138                                                                 // FIXME check the use table
139
140                         StrBuf *TheMessage = NewStrBuf();
141                         StrBufAppendPrintf(TheMessage,
142                                 "Content-type: text/html\n\n"
143                                 "\n\n"
144                                 "<html><head></head><body>"
145                         );
146         
147                         if (r->description != NULL) {
148                                 StrBufAppendPrintf(TheMessage, "%s<br><br>\r\n", r->description);
149                                 free(r->description);
150                                 r->description = NULL;
151                         }
152         
153                         if (r->link != NULL) {
154                                 StrBufAppendPrintf(TheMessage, "<a href=\"%s\">%s</a>\r\n", r->link, r->link);
155                                 free(r->link);
156                                 r->link = NULL;
157                         }
158
159                         StrBufAppendPrintf(TheMessage, "</body></html>\r\n");
160
161                         syslog(LOG_DEBUG, "------------------\n%s\n------------------", ChrPtr(TheMessage));
162                         FreeStrBuf(&TheMessage);
163
164
165
166                         struct rssroom *rr;
167                         for (rr=r->rooms; rr!=NULL; rr=rr->next) {
168                                 syslog(LOG_DEBUG, "Saving item %s to %s", r->link, rr->room);
169                         }
170                         CM_Free(r->msg);
171                         r->msg = NULL;
172                 }
173
174
175
176         }
177
178         else if (!strcasecmp(el, "title")) {                    // item subject (rss and atom)
179                 if ((r->msg != NULL) && (r->msg->cm_fields[eMsgSubject] == NULL)) {
180                         r->msg->cm_fields[eMsgSubject] = strdup(ChrPtr(r->CData));
181                         striplt(r->msg->cm_fields[eMsgSubject]);
182                 }
183         }
184
185         else if (!strcasecmp(el, "author")) {                   // author of item (rss and maybe atom)
186                 if ((r->msg != NULL) && (r->msg->cm_fields[eAuthor] == NULL)) {
187                         r->msg->cm_fields[eAuthor] = strdup(ChrPtr(r->CData));
188                         striplt(r->msg->cm_fields[eAuthor]);
189                 }
190         }
191
192         else if (!strcasecmp(el, "pubdate")) {                  // date/time stamp (rss) Sat, 25 Feb 2017 14:28:01 EST
193                 // FIXME parse it
194         }
195
196         else if (!strcasecmp(el, "updated")) {                  // date/time stamp (atom) 2003-12-13T18:30:02Z
197                 // FIXME parse it
198         }
199
200         else if (!strcasecmp(el, "link")) {                     // link to story (rss)
201                 if (r->link != NULL) {
202                         free(r->link);
203                         r->link = NULL;
204                 }
205                 r->link = strdup(ChrPtr(r->CData));
206                 striplt(r->link);
207         }
208
209         else if (
210                 (!strcasecmp(el, "description"))                // message text (rss)
211                 || (!strcasecmp(el, "summary"))                 // message text (atom)
212         ) {
213                 if (r->description != NULL) {
214                         free(r->description);
215                         r->description = NULL;
216                 }
217                 r->description = strdup(ChrPtr(r->CData));
218                 striplt(r->description);
219         }
220
221         if (r->CData != NULL) {
222                 FreeStrBuf(&r->CData);
223                 r->CData = NULL;
224         }
225 }
226
227
228 // This handler is called whenever data appears between opening and closing tags.
229 //
230 void rss_handle_data(void *data, const char *content, int length)
231 {
232         struct rssparser *r = (struct rssparser *)data;
233
234         if (r->CData == NULL) {
235                 r->CData = NewStrBuf();
236         }
237
238         StrBufAppendBufPlain(r->CData, content, length, 0);
239 }
240
241
242 // Feed has been downloaded, now parse it.
243 //
244 void rss_parse_feed(StrBuf *Feed, struct rssroom *rooms)
245 {
246         struct rssparser r;
247
248         memset(&r, 0, sizeof r);
249         r.rooms = rooms;
250         XML_Parser p = XML_ParserCreateNS("UTF-8", ':');
251         XML_SetElementHandler(p, rss_start_element, rss_end_element);
252         XML_SetCharacterDataHandler(p, rss_handle_data);
253         XML_SetUserData(p, (void *)&r);
254         XML_Parse(p, ChrPtr(Feed), StrLength(Feed), XML_TRUE);
255         XML_ParserFree(p);
256 }
257
258
259 // Add a feed/room pair into the todo list
260 //
261 void rssclient_push_todo(char *rssurl, char *roomname)
262 {
263         struct rssurl *r = NULL;
264         struct rssurl *thisone = NULL;
265         struct rssroom *newroom = NULL;
266
267         syslog(LOG_DEBUG, "rssclient_push_todo(%s, %s)", rssurl, roomname);
268
269         for (r=rsstodo; r!=NULL; r=r->next) {
270                 if (!strcasecmp(r->url, rssurl)) {
271                         thisone = r;
272                 }
273         }
274         if (thisone == NULL) {
275                 thisone = malloc(sizeof(struct rssurl));
276                 thisone->url = strdup(rssurl);
277                 thisone->rooms = NULL;
278                 thisone->next = rsstodo;
279                 rsstodo = thisone;
280         }
281
282         newroom = malloc(sizeof(struct rssroom));
283         newroom->room = strdup(roomname);
284         newroom->next = thisone->rooms;
285         thisone->rooms = newroom;
286 }
287
288
289 // Callback function for curl
290 //
291 size_t rss_pof_write_data(void *buffer, size_t size, size_t nmemb, void *userp)
292 {
293         StrBuf *Downloaded = (StrBuf *)userp;
294         size_t bytes = size * nmemb;
295         StrBufAppendBufPlain(Downloaded, buffer, bytes, 0);
296         return(bytes);
297 }
298
299
300 // pull one feed (possibly multiple rooms)
301 //
302 void rss_pull_one_feed(struct rssurl *url)
303 {
304         CURL *curl;
305         CURLcode res;
306         StrBuf *Downloaded = NULL;
307
308         syslog(LOG_DEBUG, "rss_pull_one_feed(%s)", url->url);
309
310         curl = curl_easy_init();
311         if (!curl) {
312                 return;
313         }
314
315         Downloaded = NewStrBuf();
316
317         curl_easy_setopt(curl, CURLOPT_URL, url->url);
318         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);                     // Follow redirects
319         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, rss_pof_write_data);      // What to do with downloaded data
320         curl_easy_setopt(curl, CURLOPT_WRITEDATA, Downloaded);                  // Give it our StrBuf to work with
321         curl_easy_setopt(curl, CURLOPT_TIMEOUT, 20L);                           // Time out after 20 seconds
322         res = curl_easy_perform(curl);                                          // Perform the request
323         if (res != CURLE_OK) {
324                 syslog(LOG_WARNING, "Failed to load feed: %s", curl_easy_strerror(res));
325         }
326         curl_easy_cleanup(curl);
327
328         rss_parse_feed(Downloaded, url->rooms);                                 // parse the feed
329         FreeStrBuf(&Downloaded);                                                // free the downloaded feed data
330 }
331
332
333 // We have a list, now download the feeds
334 //
335 void rss_pull_feeds(void)
336 {
337         struct rssurl *r;
338         struct rssroom *rr;
339
340         while (rsstodo != NULL) {
341                 rss_pull_one_feed(rsstodo);
342                 r = rsstodo;
343                 rsstodo = rsstodo->next;
344                 while (r->rooms != NULL) {
345                         rr = r->rooms;
346                         r->rooms = r->rooms->next;
347                         free(rr->room);
348                         free(rr);
349                 }
350                 free(r->url);
351                 free(r);
352         }
353 }
354
355
356 // Scan a room's netconfig looking for RSS feed parsing requests
357 //
358 void rssclient_scan_room(struct ctdlroom *qrbuf, void *data)
359 {
360         char *serialized_config = NULL;
361         int num_configs = 0;
362         char cfgline[SIZ];
363         int i = 0;
364
365         serialized_config = LoadRoomNetConfigFile(qrbuf->QRnumber);
366         if (!serialized_config) {
367                 return;
368         }
369
370         num_configs = num_tokens(serialized_config, '\n');
371         for (i=0; i<num_configs; ++i) {
372                 extract_token(cfgline, serialized_config, i, '\n', sizeof cfgline);
373                 if (!strncasecmp(cfgline, HKEY("rssclient|"))) {
374                         strcpy(cfgline, &cfgline[10]);
375                         char *vbar = strchr(cfgline, '|');
376                         if (vbar != NULL) {
377                                 *vbar = 0;
378                         }
379                         rssclient_push_todo(cfgline, qrbuf->QRname);
380                 }
381         }
382
383         free(serialized_config);
384 }
385
386
387 /*
388  * Scan for rooms that have RSS client requests configured
389  */
390 void rssclient_scan(void) {
391         time_t now = time(NULL);
392
393         /* Run no more than once every 15 minutes. */
394         if ((now - last_run) < 900) {
395                 syslog(LOG_DEBUG,
396                               "Client: polling interval not yet reached; last run was %ldm%lds ago",
397                               ((now - last_run) / 60),
398                               ((now - last_run) % 60)
399                 );
400                 return;
401         }
402
403         become_session(&rss_CC);
404         syslog(LOG_DEBUG, "rssclient started");
405         CtdlForEachRoom(rssclient_scan_room, NULL);
406         rss_pull_feeds();
407         syslog(LOG_DEBUG, "rssclient ended");
408         last_run = time(NULL);
409         return;
410 }
411
412
413 CTDL_MODULE_INIT(rssclient)
414 {
415         if (!threading)
416         {
417                 syslog(LOG_INFO, "%s", curl_version());
418                 CtdlRegisterSessionHook(rssclient_scan, EVT_TIMER, PRIO_AGGR + 300);
419         }
420         else
421         {
422                 CtdlFillSystemContext(&rss_CC, "rssclient");
423         }
424         return "rssclient";
425 }
426