c45e9358d7f81510ce39a5ecad4b88760363f418
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2012 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  * 
9  * 
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * 
17  * 
18  * 
19  */
20
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <ctype.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <expat.h>
42 #include <curl/curl.h>
43 #include <libcitadel.h>
44 #include "citadel.h"
45 #include "server.h"
46 #include "citserver.h"
47 #include "support.h"
48 #include "config.h"
49 #include "threads.h"
50 #include "ctdl_module.h"
51 #include "clientsocket.h"
52 #include "msgbase.h"
53 #include "parsedate.h"
54 #include "database.h"
55 #include "citadel_dirs.h"
56 #include "md5.h"
57 #include "context.h"
58 #include "event_client.h"
59 #include "rss_atom_parser.h"
60
/*
 * Forward declaration: the function is defined further down in this file
 * but is called earlier by the item/entry end handlers.
 */
void rss_remember_item(rss_item *ri, rss_aggregator *Cfg);

/*
 * Debug switch read by the DBGLOG() guard: while it is zero, LOG_DEBUG
 * messages issued through this file's logging macros are suppressed.
 */
int RSSAtomParserDebugEnabled = 0;
64
/*
 * Logging helpers.  All of them expect a local variable named `IO`
 * (AsyncIO *) to be in scope at the point of use.
 */

/* QRnumber of the aggregator that `IO` belongs to. */
#define N ((rss_aggregator*)IO->Data)->QRnumber

/*
 * Statement prefix: the statement that follows is executed unless LEVEL is
 * LOG_DEBUG and RSSAtomParserDebugEnabled is zero.
 */
#define DBGLOG(LEVEL) if ((LEVEL != LOG_DEBUG) || (RSSAtomParserDebugEnabled != 0))

/*
 * The macros below are wrapped in do { } while (0) so each expands to exactly
 * one statement; a bare "if (...) syslog(...)" would silently capture an
 * `else` that follows the macro call.
 */

/* Log with I/O id, context id and room number; takes format arguments. */
#define EVRSSATOM_syslog(LEVEL, FORMAT, ...)                            \
	do {                                                            \
		DBGLOG(LEVEL) syslog(LEVEL,                             \
				     "IO[%ld]CC[%d][%ld]RSSP" FORMAT,   \
				     IO->ID, CCID, N, __VA_ARGS__);     \
	} while (0)

/* Same as EVRSSATOM_syslog, for messages without format arguments. */
#define EVRSSATOMM_syslog(LEVEL, FORMAT)                                \
	do {                                                            \
		DBGLOG(LEVEL) syslog(LEVEL,                             \
				     "IO[%ld]CC[%d][%ld]RSSP" FORMAT,   \
				     IO->ID, CCID, N);                  \
	} while (0)

/* Variant that does not reference CCID; takes format arguments. */
#define EVRSSATOMCS_syslog(LEVEL, FORMAT, ...)                          \
	do {                                                            \
		DBGLOG(LEVEL) syslog(LEVEL, "IO[%ld][%ld]RSSP" FORMAT,  \
				     IO->ID, N, __VA_ARGS__);           \
	} while (0)

/* Variant that does not reference CCID; no format arguments. */
#define EVRSSATOMSM_syslog(LEVEL, FORMAT)                               \
	do {                                                            \
		DBGLOG(LEVEL) syslog(LEVEL, "IO[%ld][%ld]RSSP" FORMAT,  \
				     IO->ID, N);                        \
	} while (0)
86
/*
 * Convert an RDF/RSS datestamp into a time_t.
 *
 * p: NUL-terminated timestamp text; may be NULL.
 *
 * Returns 0 when p is NULL or shorter than 10 characters.
 * A string shaped like "YYYY-MM-DD" (W3C datetime, see
 * http://www.w3.org/TR/NOTE-datetime), optionally followed by "Thh:mm",
 * is converted with mktime(); seconds and any zone designator are not read.
 * Anything else is handed to parsedate(); if that yields nothing positive,
 * the current time is returned.
 *
 * This code, along with parsedate.c, is a potential candidate for
 * moving into libcitadel.
 */
time_t rdf_parsedate(const char *p)
{
	struct tm tm;
	time_t t = 0;
	size_t len;

	if (!p) return 0L;
	len = strlen(p);
	if (len < 10) return 0L;

	memset(&tm, 0, sizeof tm);

	if ( (p[4] == '-') && (p[7] == '-') ) {
		tm.tm_year = atoi(&p[0]) - 1900;
		tm.tm_mon = atoi(&p[5]) - 1;
		tm.tm_mday = atoi(&p[8]);
		/*
		 * Only len >= 10 is guaranteed so far.  p[13] may be read
		 * only if the string really extends that far; otherwise an
		 * input such as "2012-01-01T" would be read past its
		 * terminating NUL.  With len > 13, p[14] is at worst the NUL.
		 */
		if ( (len > 13) && (p[10] == 'T') && (p[13] == ':') ) {
			tm.tm_hour = atoi(&p[11]);
			tm.tm_min = atoi(&p[14]);
		}
		return mktime(&tm);
	}

	/* hmm... try RFC822 date stamp format */

	t = parsedate(p);
	if (t > 0) return(t);

	/* yeesh.  ok, just return the current date and time. */
	return(time(NULL));
}
126
127 void flush_rss_item(rss_item *ri)
128 {
129         /* Initialize the feed item data structure */
130         FreeStrBuf(&ri->guid);
131         FreeStrBuf(&ri->title);
132         FreeStrBuf(&ri->link);
133         FreeStrBuf(&ri->author_or_creator);
134         FreeStrBuf(&ri->author_email);
135         FreeStrBuf(&ri->author_url);
136         FreeStrBuf(&ri->description);
137
138         FreeStrBuf(&ri->linkTitle);
139         FreeStrBuf(&ri->reLink);
140         FreeStrBuf(&ri->reLinkTitle);
141         FreeStrBuf(&ri->channel_title);
142 }
143
144
145 /******************************************************************************
146  *                              XML-Handler                                   *
147  ******************************************************************************/
148
149
150 void RSS_item_rss_start (StrBuf *CData,
151                          rss_item *ri,
152                          rss_aggregator *RSSAggr,
153                          const char** Attr)
154 {
155         AsyncIO         *IO = &RSSAggr->IO;
156         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n");
157         RSSAggr->ItemType = RSS_RSS;
158 }
159
160 void RSS_item_rdf_start(StrBuf *CData,
161                         rss_item *ri,
162                         rss_aggregator *RSSAggr,
163                         const char** Attr)
164 {
165         AsyncIO         *IO = &RSSAggr->IO;
166         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n");
167         RSSAggr->ItemType = RSS_RSS;
168 }
169
170 void ATOM_item_feed_start(StrBuf *CData,
171                           rss_item *ri,
172                           rss_aggregator *RSSAggr,
173                           const char** Attr)
174 {
175         AsyncIO         *IO = &RSSAggr->IO;
176         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n");
177         RSSAggr->ItemType = RSS_ATOM;
178 }
179
180
181 void RSS_item_item_start(StrBuf *CData,
182                          rss_item *ri,
183                          rss_aggregator *RSSAggr,
184                          const char** Attr)
185 {
186         ri->item_tag_nesting ++;
187         flush_rss_item(ri);
188 }
189
190 void ATOM_item_entry_start(StrBuf *CData,
191                            rss_item *ri,
192                            rss_aggregator *RSSAggr,
193                            const char** Attr)
194 {
195 /* Atom feed... */
196         ri->item_tag_nesting ++;
197         flush_rss_item(ri);
198 }
199
200 void ATOM_item_link_start (StrBuf *CData,
201                            rss_item *ri,
202                            rss_aggregator *RSSAggr,
203                            const char** Attr)
204 {
205         int i;
206         const char *pHref = NULL;
207         const char *pType = NULL;
208         const char *pRel = NULL;
209         const char *pTitle = NULL;
210
211         for (i = 0; Attr[i] != NULL; i+=2)
212         {
213                 if (!strcmp(Attr[i], "href"))
214                 {
215                         pHref = Attr[i+1];
216                 }
217                 else if (!strcmp(Attr[i], "rel"))
218                 {
219                         pRel = Attr[i+1];
220                 }
221                 else if (!strcmp(Attr[i], "type"))
222                 {
223                         pType = Attr[i+1];
224                 }
225                 else if (!strcmp(Attr[i], "title"))
226                 {
227                         pTitle = Attr[i+1];
228                 }
229         }
230         if (pHref == NULL)
231                 return; /* WHUT? Pointing... where? */
232         if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml"))
233                 return;
234         /* these just point to other rss resources,
235            we're not interested in them. */
236         if (pRel != NULL)
237         {
238                 if (!strcasecmp (pRel, "replies"))
239                 {
240                         NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1);
241                         StrBufTrim(ri->link);
242                         NewStrBufDupAppendFlush(&ri->reLinkTitle,
243                                                 NULL,
244                                                 pTitle,
245                                                 -1);
246                 }
247                 else if (!strcasecmp(pRel, "alternate"))
248                 { /* Alternative representation of this Item... */
249                         NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
250                         StrBufTrim(ri->link);
251                         NewStrBufDupAppendFlush(&ri->linkTitle,
252                                                 NULL,
253                                                 pTitle,
254                                                 -1);
255
256                 }
257 #if 0 /* these are also defined, but dunno what to do with them.. */
258                 else if (!strcasecmp(pRel, "related"))
259                 {
260                 }
261                 else if (!strcasecmp(pRel, "self"))
262                 {
263                 }
264                 else if (!strcasecmp(pRel, "enclosure"))
265                 {/*...reference can get big, and is probably the full article*/
266                 }
267                 else if (!strcasecmp(pRel, "via"))
268                 {/* this article was provided via... */
269                 }
270 #endif
271         }
272         else if (StrLength(ri->link) == 0)
273         {
274                 NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
275                 StrBufTrim(ri->link);
276                 NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
277         }
278 }
279
280
281
282
283 void ATOMRSS_item_title_end(StrBuf *CData,
284                             rss_item *ri,
285                             rss_aggregator *RSSAggr,
286                             const char** Attr)
287 {
288         if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) {
289                 NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0);
290                 StrBufTrim(ri->channel_title);
291         }
292 }
293
294 void RSS_item_guid_end(StrBuf *CData,
295                        rss_item *ri,
296                        rss_aggregator *RSSAggr,
297                        const char** Attr)
298 {
299         if (StrLength(CData) > 0) {
300                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
301         }
302 }
303
304 void ATOM_item_id_end(StrBuf *CData,
305                       rss_item *ri, rss_aggregator *RSSAggr, const char** Attr)
306 {
307         if (StrLength(CData) > 0) {
308                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
309         }
310 }
311
312
313 void RSS_item_link_end (StrBuf *CData,
314                         rss_item *ri,
315                         rss_aggregator *RSSAggr,
316                         const char** Attr)
317 {
318         if (StrLength(CData) > 0) {
319                 NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0);
320                 StrBufTrim(ri->link);
321         }
322 }
323 void RSS_item_relink_end(StrBuf *CData,
324                          rss_item *ri,
325                          rss_aggregator *RSSAggr,
326                          const char** Attr)
327 {
328         if (StrLength(CData) > 0) {
329                 NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0);
330                 StrBufTrim(ri->reLink);
331         }
332 }
333
334 void RSSATOM_item_title_end (StrBuf *CData,
335                              rss_item *ri,
336                              rss_aggregator *RSSAggr,
337                              const char** Attr)
338 {
339         if (StrLength(CData) > 0) {
340                 NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0);
341                 StrBufTrim(ri->title);
342         }
343 }
344
345 void ATOM_item_content_end (StrBuf *CData,
346                             rss_item *ri,
347                             rss_aggregator *RSSAggr,
348                             const char** Attr)
349 {
350         long olen = StrLength (ri->description);
351         long clen = StrLength (CData);
352         if (clen > 0)
353         {
354                 if (olen == 0) {
355                         NewStrBufDupAppendFlush(&ri->description,
356                                                 CData,
357                                                 NULL,
358                                                 0);
359                         StrBufTrim(ri->description);
360                 }
361                 else if (olen < clen) {
362                         FlushStrBuf(ri->description);
363                         NewStrBufDupAppendFlush(&ri->description,
364                                                 CData,
365                                                 NULL,
366                                                 0);
367
368                         StrBufTrim(ri->description);
369                 }
370         }
371 }
372 void ATOM_item_summary_end (StrBuf *CData,
373                             rss_item *ri,
374                             rss_aggregator *RSSAggr,
375                             const char** Attr)
376 {
377         /*
378          * this can contain an abstract of the article.
379          * but we don't want to verwrite a full document if we already have it.
380          */
381         if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0))
382         {
383                 NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
384                 StrBufTrim(ri->description);
385         }
386 }
387
388 void RSS_item_description_end (StrBuf *CData,
389                                rss_item *ri,
390                                rss_aggregator *RSSAggr,
391                                const char** Attr)
392 {
393         long olen = StrLength (ri->description);
394         long clen = StrLength (CData);
395         if (clen > 0)
396         {
397                 if (olen == 0) {
398                         NewStrBufDupAppendFlush(&ri->description,
399                                                 CData,
400                                                 NULL,
401                                                 0);
402                         StrBufTrim(ri->description);
403                 }
404                 else if (olen < clen) {
405                         FlushStrBuf(ri->description);
406                         NewStrBufDupAppendFlush(&ri->description,
407                                                 CData,
408                                                 NULL,
409                                                 0);
410                         StrBufTrim(ri->description);
411                 }
412         }
413 }
414
415 void ATOM_item_published_end (StrBuf *CData,
416                               rss_item *ri,
417                               rss_aggregator *RSSAggr,
418                               const char** Attr)
419 {
420         if (StrLength(CData) > 0) {
421                 StrBufTrim(CData);
422                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
423         }
424 }
425
426 void ATOM_item_updated_end (StrBuf *CData,
427                             rss_item *ri,
428                             rss_aggregator *RSSAggr,
429                             const char** Attr)
430 {
431         if (StrLength(CData) > 0) {
432                 StrBufTrim(CData);
433                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
434         }
435 }
436
437 void RSS_item_pubdate_end (StrBuf *CData,
438                            rss_item *ri,
439                            rss_aggregator *RSSAggr,
440                            const char** Attr)
441 {
442         if (StrLength(CData) > 0) {
443                 StrBufTrim(CData);
444                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
445         }
446 }
447
448
449 void RSS_item_date_end (StrBuf *CData,
450                         rss_item *ri,
451                         rss_aggregator *RSSAggr,
452                         const char** Attr)
453 {
454         if (StrLength(CData) > 0) {
455                 StrBufTrim(CData);
456                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
457         }
458 }
459
460
461
462 void RSS_item_author_end(StrBuf *CData,
463                          rss_item *ri,
464                          rss_aggregator *RSSAggr,
465                          const char** Attr)
466 {
467         if (StrLength(CData) > 0) {
468                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
469                 StrBufTrim(ri->author_or_creator);
470         }
471 }
472
473
474 void ATOM_item_name_end(StrBuf *CData,
475                         rss_item *ri,
476                         rss_aggregator *RSSAggr,
477                         const char** Attr)
478 {
479         if (StrLength(CData) > 0) {
480                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
481                 StrBufTrim(ri->author_or_creator);
482         }
483 }
484
485 void ATOM_item_email_end(StrBuf *CData,
486                          rss_item *ri,
487                          rss_aggregator *RSSAggr,
488                          const char** Attr)
489 {
490         if (StrLength(CData) > 0) {
491                 NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0);
492                 StrBufTrim(ri->author_email);
493         }
494 }
495
496 void RSS_item_creator_end(StrBuf *CData,
497                           rss_item *ri,
498                           rss_aggregator *RSSAggr,
499                           const char** Attr)
500 {
501         if ((StrLength(CData) > 0) &&
502             (StrLength(ri->author_or_creator) == 0))
503         {
504                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
505                 StrBufTrim(ri->author_or_creator);
506         }
507 }
508
509
510 void ATOM_item_uri_end(StrBuf *CData,
511                        rss_item *ri,
512                        rss_aggregator *RSSAggr,
513                        const char** Attr)
514 {
515         if (StrLength(CData) > 0) {
516                 NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0);
517                 StrBufTrim(ri->author_url);
518         }
519 }
520
521 void RSS_item_item_end(StrBuf *CData,
522                        rss_item *ri,
523                        rss_aggregator *RSSAggr,
524                        const char** Attr)
525 {
526         --ri->item_tag_nesting;
527         rss_remember_item(ri, RSSAggr);
528 }
529
530
531 void ATOM_item_entry_end(StrBuf *CData,
532                          rss_item *ri,
533                          rss_aggregator *RSSAggr,
534                          const char** Attr)
535 {
536         --ri->item_tag_nesting;
537         rss_remember_item(ri, RSSAggr);
538 }
539
540 void RSS_item_rss_end(StrBuf *CData,
541                       rss_item *ri,
542                       rss_aggregator *RSSAggr,
543                       const char** Attr)
544 {
545         AsyncIO         *IO = &RSSAggr->IO;
546         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
547         ri->done_parsing = 1;
548 }
549
550 void RSS_item_rdf_end(StrBuf *CData,
551                       rss_item *ri,
552                       rss_aggregator *RSSAggr,
553                       const char** Attr)
554 {
555         AsyncIO         *IO = &RSSAggr->IO;
556         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
557         ri->done_parsing = 1;
558 }
559
560
561 void RSSATOM_item_ignore(StrBuf *CData,
562                          rss_item *ri,
563                          rss_aggregator *RSSAggr,
564                          const char** Attr)
565 {
566 }
567
568
569
570 /*
571  * This callback stores up the data which appears in between tags.
572  */
573 void rss_xml_cdata_start(void *data)
574 {
575         rss_aggregator *RSSAggr = (rss_aggregator*) data;
576
577         FlushStrBuf(RSSAggr->CData);
578 }
579
/*
 * CDATA-section end callback: intentionally empty.
 */
void rss_xml_cdata_end(void *data)
{
	(void) data;
}
583 void rss_xml_chardata(void *data, const XML_Char *s, int len)
584 {
585         rss_aggregator *RSSAggr = (rss_aggregator*) data;
586
587         StrBufAppendBufPlain (RSSAggr->CData, s, len, 0);
588 }
589
590
591 /******************************************************************************
592  *                            RSS parser logic                                *
593  ******************************************************************************/
594
/* Mutex defined in another translation unit. */
extern pthread_mutex_t RSSQueueMutex;

/*
 * Lookup tables used by rss_xml_start() and rss_xml_end():
 *  - StartHandlers / EndHandlers are queried with the lower-cased element
 *    name and yield an rss_xml_handler;
 *  - KnownNameSpaces is queried with the namespace part of an element name.
 * All three start out NULL; they are not filled in this part of the file.
 */
HashList *StartHandlers = NULL;
HashList *EndHandlers = NULL;
HashList *KnownNameSpaces = NULL;
600
601 void FreeNetworkSaveMessage (void *vMsg)
602 {
603         networker_save_message *Msg = (networker_save_message *) vMsg;
604
605         CtdlFreeMessageContents(&Msg->Msg);
606         FreeStrBuf(&Msg->Message);
607         FreeStrBuf(&Msg->MsgGUID);
608
609         FreeStrBuf(&Msg->author_email);
610         FreeStrBuf(&Msg->author_or_creator);
611         FreeStrBuf(&Msg->title);
612         FreeStrBuf(&Msg->description);
613
614         FreeStrBuf(&Msg->link);
615         FreeStrBuf(&Msg->linkTitle);
616
617         FreeStrBuf(&Msg->reLink);
618         FreeStrBuf(&Msg->reLinkTitle);
619
620         free(Msg);
621 }
622
623
624 /*
625  * Commit a fetched and parsed RSS item to disk
626  */
627 void rss_remember_item(rss_item *ri, rss_aggregator *RSSAggr)
628 {
629         networker_save_message *SaveMsg;
630         struct MD5Context md5context;
631         u_char rawdigest[MD5_DIGEST_LEN];
632         StrBuf *guid;
633         AsyncIO *IO = &RSSAggr->IO;
634         int n;
635
636
637         SaveMsg = (networker_save_message *) malloc(
638                 sizeof(networker_save_message));
639         memset(SaveMsg, 0, sizeof(networker_save_message));
640
641         /* Construct a GUID to use in the S_USETABLE table.
642          * If one is not present in the item itself, make one up.
643          */
644         if (ri->guid != NULL) {
645                 StrBufSpaceToBlank(ri->guid);
646                 StrBufTrim(ri->guid);
647                 guid = NewStrBufPlain(HKEY("rss/"));
648                 StrBufAppendBuf(guid, ri->guid, 0);
649         }
650         else {
651                 MD5Init(&md5context);
652                 if (ri->title != NULL) {
653                         MD5Update(&md5context,
654                                   (const unsigned char*)SKEY(ri->title));
655                 }
656                 if (ri->link != NULL) {
657                         MD5Update(&md5context,
658                                   (const unsigned char*)SKEY(ri->link));
659                 }
660                 MD5Final(rawdigest, &md5context);
661                 guid = NewStrBufPlain(NULL,
662                                       MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
663                 StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
664                 StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
665         }
666
667         /* translate Item into message. */
668         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: translating item...\n");
669         if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
670         StrBufSpaceToBlank(ri->description);
671         SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
672         SaveMsg->Msg.cm_anon_type = MES_NORMAL;
673         SaveMsg->Msg.cm_format_type = FMT_RFC822;
674
675         /* gather the cheaply computed information now... */
676
677         if (ri->guid != NULL) {
678                 SaveMsg->Msg.cm_fields['E'] = strdup(ChrPtr(ri->guid));
679         }
680
681         SaveMsg->MsgGUID = guid;
682
683         if (ri->pubdate <= 0) {
684                 ri->pubdate = time(NULL); /// TODO: use event time!
685         }
686         SaveMsg->Msg.cm_fields['T'] = malloc(64);
687         snprintf(SaveMsg->Msg.cm_fields['T'], 64, "%ld", ri->pubdate);
688         if (ri->channel_title != NULL) {
689                 if (StrLength(ri->channel_title) > 0) {
690                         SaveMsg->Msg.cm_fields['O'] =
691                                 strdup(ChrPtr(ri->channel_title));
692                 }
693         }
694
695         /* remember the ones for defferred processing to save computing power after we know if we realy need it. */
696
697         SaveMsg->author_or_creator = ri->author_or_creator;
698         ri->author_or_creator = NULL;
699
700         SaveMsg->author_email = ri->author_email;
701         ri->author_email = NULL;
702
703         SaveMsg->title = ri->title;
704         ri->title = NULL;
705
706         SaveMsg->link = ri->link;
707         ri->link = NULL;
708
709         SaveMsg->description = ri->description;
710         ri->description = NULL;
711
712         SaveMsg->linkTitle = ri->linkTitle;
713         ri->linkTitle = NULL;
714
715         SaveMsg->reLink = ri->reLink;
716         ri->reLink = NULL;
717
718         SaveMsg->reLinkTitle = ri->reLinkTitle;
719         ri->reLinkTitle = NULL;
720
721         n = GetCount(RSSAggr->Messages) + 1;
722         Put(RSSAggr->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
723 }
724
725
726
727 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
728 {
729         rss_xml_handler *h;
730         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
731         AsyncIO         *IO = &RSSAggr->IO;
732         rss_item        *ri = RSSAggr->Item;
733         void            *pv;
734         const char      *pel;
735         char            *sep = NULL;
736
737         /* Axe the namespace, we don't care about it */
738         /*
739           syslog(LOG_DEBUG,
740           "RSS: supplied el %d: %s\n", RSSAggr->RSSAggr->ItemType, supplied_el);
741         */
742         pel = supplied_el;
743         while (sep = strchr(pel, ':'), sep) {
744                 pel = sep + 1;
745         }
746
747         if (pel != supplied_el)
748         {
749                 void *v;
750
751                 if (!GetHash(KnownNameSpaces,
752                              supplied_el,
753                              pel - supplied_el - 1,
754                              &v))
755                 {
756                         EVRSSATOM_syslog(LOG_DEBUG,
757                                          "RSS: START ignoring "
758                                          "because of wrong namespace [%s]\n",
759                                          supplied_el);
760                         return;
761                 }
762         }
763
764         StrBufPlain(RSSAggr->Key, pel, -1);
765         StrBufLowerCase(RSSAggr->Key);
766         if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv))
767         {
768                 h = (rss_xml_handler*) pv;
769
770                 if (((h->Flags & RSS_UNSET) != 0) &&
771                     (RSSAggr->ItemType == RSS_UNSET))
772                 {
773                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
774                 }
775                 else if (((h->Flags & RSS_RSS) != 0) &&
776                     (RSSAggr->ItemType == RSS_RSS))
777                 {
778                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
779                 }
780                 else if (((h->Flags & RSS_ATOM) != 0) &&
781                          (RSSAggr->ItemType == RSS_ATOM))
782                 {
783                         h->Handler(RSSAggr->CData,
784                                    ri,
785                                    RSSAggr,
786                                    attr);
787                 }
788                 else
789                         EVRSSATOM_syslog(LOG_DEBUG,
790                                           "RSS: START unhandled: [%s] [%s]...\n",
791                                          pel,
792                                          supplied_el);
793         }
794         else
795                 EVRSSATOM_syslog(LOG_DEBUG,
796                                  "RSS: START unhandled: [%s] [%s]...\n",
797                                  pel,
798                                  supplied_el);
799 }
800
801 void rss_xml_end(void *data, const char *supplied_el)
802 {
803         rss_xml_handler *h;
804         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
805         AsyncIO         *IO = &RSSAggr->IO;
806         rss_item        *ri = RSSAggr->Item;
807         const char      *pel;
808         char            *sep = NULL;
809         void            *pv;
810
811         /* Axe the namespace, we don't care about it */
812         pel = supplied_el;
813         while (sep = strchr(pel, ':'), sep) {
814                 pel = sep + 1;
815         }
816         EVRSSATOM_syslog(LOG_DEBUG, "RSS: END %s...\n", supplied_el);
817         if (pel != supplied_el)
818         {
819                 void *v;
820
821                 if (!GetHash(KnownNameSpaces,
822                              supplied_el,
823                              pel - supplied_el - 1,
824                              &v))
825                 {
826                         EVRSSATOM_syslog(LOG_DEBUG,
827                                          "RSS: END ignoring because of wrong namespace"
828                                          "[%s] = [%s]\n",
829                                          supplied_el,
830                                          ChrPtr(RSSAggr->CData));
831                         FlushStrBuf(RSSAggr->CData);
832                         return;
833                 }
834         }
835
836         StrBufPlain(RSSAggr->Key, pel, -1);
837         StrBufLowerCase(RSSAggr->Key);
838         if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv))
839         {
840                 h = (rss_xml_handler*) pv;
841
842                 if (((h->Flags & RSS_UNSET) != 0) &&
843                     (RSSAggr->ItemType == RSS_UNSET))
844                 {
845                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
846                 }
847                 else if (((h->Flags & RSS_RSS) != 0) &&
848                     (RSSAggr->ItemType == RSS_RSS))
849                 {
850                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
851                 }
852                 else if (((h->Flags & RSS_ATOM) != 0) &&
853                          (RSSAggr->ItemType == RSS_ATOM))
854                 {
855                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
856                 }
857                 else
858                         EVRSSATOM_syslog(LOG_DEBUG,
859                                          "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
860                                          pel,
861                                          supplied_el,
862                                          ChrPtr(RSSAggr->CData));
863         }
864         else
865                 EVRSSATOM_syslog(LOG_DEBUG,
866                                  "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
867                                  pel,
868                                  supplied_el,
869                                  ChrPtr(RSSAggr->CData));
870         FlushStrBuf(RSSAggr->CData);
871 }
872
873
874
875 /*
876  * Callback function for passing libcurl's output to expat for parsing
877  * we don't do streamed parsing so expat can handle non-utf8 documents
878 size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
879 {
880         XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
881         return (size*nmemb);
882 }
883  */
884
885
886
887 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
888 {
889         StrBuf *Buf;
890         rss_aggregator *RSSAggr;
891         rss_item *ri;
892         const char *at;
893         char *ptr;
894         long len;
895         const char *Key;
896
897         RSSAggr = IO->Data;
898         ri = RSSAggr->Item;
899         RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
900         RSSAggr->Key = NewStrBuf();
901         at = NULL;
902         StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at);
903         ptr = NULL;
904
905 #define encoding "encoding=\""
906         ptr = strstr(ChrPtr(RSSAggr->Key), encoding);
907         if (ptr != NULL)
908         {
909                 char *pche;
910
911                 ptr += sizeof (encoding) - 1;
912                 pche = strchr(ptr, '"');
913                 if (pche != NULL)
914                         StrBufCutAt(RSSAggr->Key, -1, pche);
915                 else
916                         ptr = "UTF-8";
917         }
918         else
919                 ptr = "UTF-8";
920
921         EVRSSATOM_syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(RSSAggr->Url));
922
923         RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
924         if (!RSSAggr->xp) {
925                 EVRSSATOMM_syslog(LOG_ALERT, "Cannot create XML parser!\n");
926                 return eAbort;
927         }
928         FlushStrBuf(RSSAggr->Key);
929
930         RSSAggr->Messages = NewHash(1, Flathash);
931         XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end);
932         XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata);
933         XML_SetUserData(RSSAggr->xp, RSSAggr);
934         XML_SetCdataSectionHandler(RSSAggr->xp,
935                                    rss_xml_cdata_start,
936                                    rss_xml_cdata_end);
937
938
939         len = StrLength(IO->HttpReq.ReplyData);
940         ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
941         XML_Parse(RSSAggr->xp, ptr, len, 0);
942         free (ptr);
943         if (ri->done_parsing == 0)
944                 XML_Parse(RSSAggr->xp, "", 0, 1);
945
946
947         EVRSSATOM_syslog(LOG_DEBUG, "RSS: XML Status [%s] \n",
948                          XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
949
950         XML_ParserFree(RSSAggr->xp);
951         flush_rss_item(ri);
952
953         Buf = NewStrBufDup(RSSAggr->rooms);
954         RSSAggr->recp.recp_room = SmashStrBuf(&Buf);
955         RSSAggr->recp.num_room = RSSAggr->roomlist_parts;
956         RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC;
957
958         RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
959
960 //RSSAggr->next_poll = time(NULL) + config.c_net_freq;
961         if (GetNextHashPos(RSSAggr->Messages,
962                            RSSAggr->Pos,
963                            &len,
964                            &Key,
965                            (void**) &RSSAggr->ThisMsg))
966                 return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry);
967         else
968                 return eAbort;
969 }
970
971
972 /******************************************************************************
973  *                    RSS handler registering logic                           *
974  ******************************************************************************/
975
976 void AddRSSStartHandler(rss_handler_func Handler,
977                         int Flags,
978                         const char *key,
979                         long len)
980 {
981         rss_xml_handler *h;
982         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
983         h->Flags = Flags;
984         h->Handler = Handler;
985         Put(StartHandlers, key, len, h, NULL);
986 }
987
988 void AddRSSEndHandler(rss_handler_func Handler,
989                       int Flags,
990                       const char *key,
991                       long len)
992 {
993         rss_xml_handler *h;
994         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
995         h->Flags = Flags;
996         h->Handler = Handler;
997         Put(EndHandlers, key, len, h, NULL);
998 }
999
1000 void rss_parser_cleanup(void)
1001 {
1002         DeleteHash(&StartHandlers);
1003         DeleteHash(&EndHandlers);
1004         DeleteHash(&KnownNameSpaces);
1005 }
1006
1007 void LogDebugEnableRSSATOMParser(const int n)
1008 {
1009         RSSAtomParserDebugEnabled = n;
1010 }
1011
1012 CTDL_MODULE_INIT(rssparser)
1013 {
1014         if (!threading)
1015         {
1016                 StartHandlers = NewHash(1, NULL);
1017                 EndHandlers = NewHash(1, NULL);
1018
1019                 AddRSSStartHandler(RSS_item_rss_start,     RSS_UNSET, HKEY("rss"));
1020                 AddRSSStartHandler(RSS_item_rdf_start,     RSS_UNSET, HKEY("rdf"));
1021                 AddRSSStartHandler(ATOM_item_feed_start,   RSS_UNSET, HKEY("feed"));
1022                 AddRSSStartHandler(RSS_item_item_start,    RSS_RSS, HKEY("item"));
1023                 AddRSSStartHandler(ATOM_item_entry_start,  RSS_ATOM, HKEY("entry"));
1024                 AddRSSStartHandler(ATOM_item_link_start,   RSS_ATOM, HKEY("link"));
1025
1026                 AddRSSEndHandler(ATOMRSS_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1027                 AddRSSEndHandler(RSS_item_guid_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid"));
1028                 AddRSSEndHandler(ATOM_item_id_end,         RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id"));
1029                 AddRSSEndHandler(RSS_item_link_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("link"));
1030 #if 0
1031 // hm, rss to the comments of that blog, might be interesting in future, but...
1032                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss"));
1033 // comment count...
1034                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments"));
1035 #endif
1036                 AddRSSEndHandler(RSSATOM_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1037                 AddRSSEndHandler(ATOM_item_content_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("content"));
1038                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_ATOM|RSS_REQUIRE_BUF, HKEY("encoded"));
1039                 AddRSSEndHandler(ATOM_item_summary_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("summary"));
1040                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("description"));
1041                 AddRSSEndHandler(ATOM_item_published_end,  RSS_ATOM|RSS_REQUIRE_BUF, HKEY("published"));
1042                 AddRSSEndHandler(ATOM_item_updated_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("updated"));
1043                 AddRSSEndHandler(RSS_item_pubdate_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("pubdate"));
1044                 AddRSSEndHandler(RSS_item_date_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("date"));
1045                 AddRSSEndHandler(RSS_item_author_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("author"));
1046                 AddRSSEndHandler(RSS_item_creator_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("creator"));
1047 /* <author> */
1048                 AddRSSEndHandler(ATOM_item_email_end,      RSS_ATOM|RSS_REQUIRE_BUF, HKEY("email"));
1049                 AddRSSEndHandler(ATOM_item_name_end,       RSS_ATOM|RSS_REQUIRE_BUF, HKEY("name"));
1050                 AddRSSEndHandler(ATOM_item_uri_end,        RSS_ATOM|RSS_REQUIRE_BUF, HKEY("uri"));
1051 /* </author> */
1052                 AddRSSEndHandler(RSS_item_item_end,        RSS_RSS, HKEY("item"));
1053                 AddRSSEndHandler(RSS_item_rss_end,         RSS_RSS, HKEY("rss"));
1054                 AddRSSEndHandler(RSS_item_rdf_end,         RSS_RSS, HKEY("rdf"));
1055                 AddRSSEndHandler(ATOM_item_entry_end,      RSS_ATOM, HKEY("entry"));
1056
1057
1058 /* at the start of atoms: <seq> <li>link to resource</li></seq> ignore them. */
1059                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1060                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1061                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1062                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1063
1064 /* links to other feed generators... */
1065                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1066                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1067                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1068                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1069
1070                 KnownNameSpaces = NewHash(1, NULL);
1071                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearch/1.1/"), NULL, reference_free_handler);
1072                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearchrss/1.0/"), NULL, reference_free_handler);
1073                 Put(KnownNameSpaces, HKEY("http://backend.userland.com/creativeCommonsRssModule"), NULL, reference_free_handler);
1074                 Put(KnownNameSpaces, HKEY("http://purl.org/atom/ns#"), NULL, reference_free_handler);
1075                 Put(KnownNameSpaces, HKEY("http://purl.org/dc/elements/1.1/"), NULL, reference_free_handler);
1076                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1077                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/content/"), NULL, reference_free_handler);
1078                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/slash/"), NULL, reference_free_handler);
1079                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/syndication/"), NULL, reference_free_handler);
1080                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1081                 Put(KnownNameSpaces, HKEY("http://purl.org/syndication/thread/1.0"), NULL, reference_free_handler);
1082                 Put(KnownNameSpaces, HKEY("http://rssnamespace.org/feedburner/ext/1.0"), NULL, reference_free_handler);
1083                 Put(KnownNameSpaces, HKEY("http://schemas.google.com/g/2005"), NULL, reference_free_handler);
1084                 Put(KnownNameSpaces, HKEY("http://webns.net/mvcb/"), NULL, reference_free_handler);
1085                 Put(KnownNameSpaces, HKEY("http://web.resource.org/cc/"), NULL, reference_free_handler);
1086                 Put(KnownNameSpaces, HKEY("http://wellformedweb.org/CommentAPI/"), NULL, reference_free_handler);
1087                 Put(KnownNameSpaces, HKEY("http://www.georss.org/georss"), NULL, reference_free_handler);
1088                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/xhtml"), NULL, reference_free_handler);
1089                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1090                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1091                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2003/01/geo/wgs84_pos#"), NULL, reference_free_handler);
1092                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2005/Atom"), NULL, reference_free_handler);
1093                 Put(KnownNameSpaces, HKEY("urn:flickr:"), NULL, reference_free_handler);
1094 #if 0
1095                 /* we don't like these namespaces because of they shadow our usefull parameters. */
1096                 Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler);
1097 #endif
1098                 CtdlRegisterDebugFlagHook(HKEY("RSSAtomParser"), LogDebugEnableRSSATOMParser, &RSSAtomParserDebugEnabled);
1099                 CtdlRegisterCleanupHook(rss_parser_cleanup);
1100         }
1101         return "rssparser";
1102 }