fdba27577fb0008f61f16e94b02ade85d377588f
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2015 by the citadel.org team
5  *
6  * This program is open source software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 3.
8  * 
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  */
14
15 #include <stdlib.h>
16 #include <unistd.h>
17 #include <stdio.h>
18
19 #if TIME_WITH_SYS_TIME
20 # include <sys/time.h>
21 # include <time.h>
22 #else
23 # if HAVE_SYS_TIME_H
24 #  include <sys/time.h>
25 # else
26 #  include <time.h>
27 # endif
28 #endif
29
30 #include <ctype.h>
31 #include <string.h>
32 #include <errno.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <expat.h>
36 #include <curl/curl.h>
37 #include <libcitadel.h>
38 #include "citadel.h"
39 #include "server.h"
40 #include "citserver.h"
41 #include "support.h"
42 #include "config.h"
43 #include "threads.h"
44 #include "ctdl_module.h"
45 #include "clientsocket.h"
46 #include "msgbase.h"
47 #include "parsedate.h"
48 #include "database.h"
49 #include "citadel_dirs.h"
50 #include "md5.h"
51 #include "context.h"
52 #include "event_client.h"
53 #include "rss_atom_parser.h"
54
55 void rss_remember_item(rss_item *ri, rss_aggregator *Cfg);
56
57 int RSSAtomParserDebugEnabled = 0;
58
/*
 * Cfg.QRnumber of the aggregator hanging off the AsyncIO that is named
 * "IO" in the scope where the macro is expanded.
 */
#define N (((rss_aggregator*)IO->Data)->Cfg.QRnumber)

/*
 * Guard for the statement that follows it: the statement runs unless LEVEL
 * is LOG_DEBUG while RSSAtomParserDebugEnabled is zero.
 */
#define DBGLOG(LEVEL) if (((LEVEL) != LOG_DEBUG) || (RSSAtomParserDebugEnabled != 0))

/*
 * Logging helpers.  All of them expect a variable "IO" in the calling scope.
 * Each expands to a single do { } while (0) statement so that it can be used
 * as the body of an if/else without capturing a following "else".
 *
 * EVRSSATOM_syslog   - with client context id, with format arguments
 * EVRSSATOMM_syslog  - with client context id, message only
 * EVRSSATOMCS_syslog - without client context id, with format arguments
 * EVRSSATOMSM_syslog - without client context id, message only
 */
#define EVRSSATOM_syslog(LEVEL, FORMAT, ...)                            \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL,                             \
                             "%s[%ld]CC[%d][%ld]RSSP" FORMAT,           \
                             IOSTR, IO->ID, CCID, N, __VA_ARGS__);      \
        } while (0)

#define EVRSSATOMM_syslog(LEVEL, FORMAT)                                \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL,                             \
                             "%s[%ld]CC[%d][%ld]RSSP" FORMAT,           \
                             IOSTR, IO->ID, CCID, N);                   \
        } while (0)

#define EVRSSATOMCS_syslog(LEVEL, FORMAT, ...)                          \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL, "%s[%ld][%ld]RSSP" FORMAT,  \
                             IOSTR, IO->ID, N, __VA_ARGS__);            \
        } while (0)

#define EVRSSATOMSM_syslog(LEVEL, FORMAT)                               \
        do {                                                            \
                DBGLOG(LEVEL) syslog(LEVEL, "%s[%ld][%ld]RSSP" FORMAT,  \
                             IOSTR, IO->ID, N);                         \
        } while (0)
80
/*
 * Convert an RDF/RSS datestamp into a time_t.
 *
 * p    NUL-terminated datestamp, may be NULL.
 *
 * Stamps that look like W3C datetime ("YYYY-MM-DD", optionally followed by
 * "Thh:mm"; see http://www.w3.org/TR/NOTE-datetime) are decoded here.
 * Seconds and any time zone designator are not evaluated; the broken-down
 * time is handed to mktime(), which interprets it as local time.
 * Everything else is passed to parsedate().
 *
 * Returns 0 if p is NULL or shorter than 10 characters, the parsed time on
 * success, and the current time if no parser accepted the stamp.
 *
 * This code, along with parsedate.c, is a potential candidate for
 * moving into libcitadel.
 */
time_t rdf_parsedate(const char *p)
{
        struct tm tm;
        time_t t = 0;
        size_t len;

        if (!p) return 0L;
        len = strlen(p);
        if (len < 10) return 0L;

        memset(&tm, 0, sizeof tm);

        if ( (p[4] == '-') && (p[7] == '-') ) {
                tm.tm_year = atoi(&p[0]) - 1900;
                tm.tm_mon = atoi(&p[5]) - 1;
                tm.tm_mday = atoi(&p[8]);
                /*
                 * p[13] only exists if the string holds at least 14
                 * characters; check the length first so a stamp such as
                 * "2020-01-01T" is not read beyond its terminator.
                 */
                if ( (len >= 14) && (p[10] == 'T') && (p[13] == ':') ) {
                        tm.tm_hour = atoi(&p[11]);
                        tm.tm_min = atoi(&p[14]);
                }
                return mktime(&tm);
        }

        /* hmm... try RFC822 date stamp format */
        t = parsedate(p);
        if (t > 0) return(t);

        /* yeesh.  ok, just return the current date and time. */
        return(time(NULL));
}
120
121 void flush_rss_item(rss_item *ri)
122 {
123         /* Initialize the feed item data structure */
124         FreeStrBuf(&ri->guid);
125         FreeStrBuf(&ri->title);
126         FreeStrBuf(&ri->link);
127         FreeStrBuf(&ri->author_or_creator);
128         FreeStrBuf(&ri->author_email);
129         FreeStrBuf(&ri->author_url);
130         FreeStrBuf(&ri->description);
131
132         FreeStrBuf(&ri->linkTitle);
133         FreeStrBuf(&ri->reLink);
134         FreeStrBuf(&ri->reLinkTitle);
135         FreeStrBuf(&ri->channel_title);
136 }
137
138
139 /******************************************************************************
140  *                              XML-Handler                                   *
141  ******************************************************************************/
142
143
144 void RSS_item_rss_start (StrBuf *CData,
145                          rss_item *ri,
146                          rss_aggregator *RSSAggr,
147                          const char** Attr)
148 {
149         AsyncIO         *IO = &RSSAggr->IO;
150         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n");
151         RSSAggr->ItemType = RSS_RSS;
152 }
153
154 void RSS_item_rdf_start(StrBuf *CData,
155                         rss_item *ri,
156                         rss_aggregator *RSSAggr,
157                         const char** Attr)
158 {
159         AsyncIO         *IO = &RSSAggr->IO;
160         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n");
161         RSSAggr->ItemType = RSS_RSS;
162 }
163
164 void ATOM_item_feed_start(StrBuf *CData,
165                           rss_item *ri,
166                           rss_aggregator *RSSAggr,
167                           const char** Attr)
168 {
169         AsyncIO         *IO = &RSSAggr->IO;
170         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n");
171         RSSAggr->ItemType = RSS_ATOM;
172 }
173
174
175 void RSS_item_item_start(StrBuf *CData,
176                          rss_item *ri,
177                          rss_aggregator *RSSAggr,
178                          const char** Attr)
179 {
180         ri->item_tag_nesting ++;
181         flush_rss_item(ri);
182 }
183
184 void ATOM_item_entry_start(StrBuf *CData,
185                            rss_item *ri,
186                            rss_aggregator *RSSAggr,
187                            const char** Attr)
188 {
189 /* Atom feed... */
190         ri->item_tag_nesting ++;
191         flush_rss_item(ri);
192 }
193
194 void ATOM_item_link_start (StrBuf *CData,
195                            rss_item *ri,
196                            rss_aggregator *RSSAggr,
197                            const char** Attr)
198 {
199         int i;
200         const char *pHref = NULL;
201         const char *pType = NULL;
202         const char *pRel = NULL;
203         const char *pTitle = NULL;
204
205         for (i = 0; Attr[i] != NULL; i+=2)
206         {
207                 if (!strcmp(Attr[i], "href"))
208                 {
209                         pHref = Attr[i+1];
210                 }
211                 else if (!strcmp(Attr[i], "rel"))
212                 {
213                         pRel = Attr[i+1];
214                 }
215                 else if (!strcmp(Attr[i], "type"))
216                 {
217                         pType = Attr[i+1];
218                 }
219                 else if (!strcmp(Attr[i], "title"))
220                 {
221                         pTitle = Attr[i+1];
222                 }
223         }
224         if (pHref == NULL)
225                 return; /* WHUT? Pointing... where? */
226         if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml"))
227                 return;
228         /* these just point to other rss resources,
229            we're not interested in them. */
230         if (pRel != NULL)
231         {
232                 if (!strcasecmp (pRel, "replies"))
233                 {
234                         NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1);
235                         StrBufTrim(ri->link);
236                         NewStrBufDupAppendFlush(&ri->reLinkTitle,
237                                                 NULL,
238                                                 pTitle,
239                                                 -1);
240                 }
241                 else if (!strcasecmp(pRel, "alternate"))
242                 { /* Alternative representation of this Item... */
243                         NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
244                         StrBufTrim(ri->link);
245                         NewStrBufDupAppendFlush(&ri->linkTitle,
246                                                 NULL,
247                                                 pTitle,
248                                                 -1);
249
250                 }
251 #if 0 /* these are also defined, but dunno what to do with them.. */
252                 else if (!strcasecmp(pRel, "related"))
253                 {
254                 }
255                 else if (!strcasecmp(pRel, "self"))
256                 {
257                 }
258                 else if (!strcasecmp(pRel, "enclosure"))
259                 {/*...reference can get big, and is probably the full article*/
260                 }
261                 else if (!strcasecmp(pRel, "via"))
262                 {/* this article was provided via... */
263                 }
264 #endif
265         }
266         else if (StrLength(ri->link) == 0)
267         {
268                 NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
269                 StrBufTrim(ri->link);
270                 NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
271         }
272 }
273
274
275
276
277 void ATOMRSS_item_title_end(StrBuf *CData,
278                             rss_item *ri,
279                             rss_aggregator *RSSAggr,
280                             const char** Attr)
281 {
282         if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) {
283                 NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0);
284                 StrBufTrim(ri->channel_title);
285         }
286 }
287
288 void RSS_item_guid_end(StrBuf *CData,
289                        rss_item *ri,
290                        rss_aggregator *RSSAggr,
291                        const char** Attr)
292 {
293         if (StrLength(CData) > 0) {
294                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
295         }
296 }
297
298 void ATOM_item_id_end(StrBuf *CData,
299                       rss_item *ri, rss_aggregator *RSSAggr, const char** Attr)
300 {
301         if (StrLength(CData) > 0) {
302                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
303         }
304 }
305
306
307 void RSS_item_link_end (StrBuf *CData,
308                         rss_item *ri,
309                         rss_aggregator *RSSAggr,
310                         const char** Attr)
311 {
312         if (StrLength(CData) > 0) {
313                 NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0);
314                 StrBufTrim(ri->link);
315         }
316 }
317 void RSS_item_relink_end(StrBuf *CData,
318                          rss_item *ri,
319                          rss_aggregator *RSSAggr,
320                          const char** Attr)
321 {
322         if (StrLength(CData) > 0) {
323                 NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0);
324                 StrBufTrim(ri->reLink);
325         }
326 }
327
328 void RSSATOM_item_title_end (StrBuf *CData,
329                              rss_item *ri,
330                              rss_aggregator *RSSAggr,
331                              const char** Attr)
332 {
333         if (StrLength(CData) > 0) {
334                 NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0);
335                 StrBufTrim(ri->title);
336         }
337 }
338
339 void ATOM_item_content_end (StrBuf *CData,
340                             rss_item *ri,
341                             rss_aggregator *RSSAggr,
342                             const char** Attr)
343 {
344         long olen = StrLength (ri->description);
345         long clen = StrLength (CData);
346         if (clen > 0)
347         {
348                 if (olen == 0) {
349                         NewStrBufDupAppendFlush(&ri->description,
350                                                 CData,
351                                                 NULL,
352                                                 0);
353                         StrBufTrim(ri->description);
354                 }
355                 else if (olen < clen) {
356                         FlushStrBuf(ri->description);
357                         NewStrBufDupAppendFlush(&ri->description,
358                                                 CData,
359                                                 NULL,
360                                                 0);
361
362                         StrBufTrim(ri->description);
363                 }
364         }
365 }
366 void ATOM_item_summary_end (StrBuf *CData,
367                             rss_item *ri,
368                             rss_aggregator *RSSAggr,
369                             const char** Attr)
370 {
371         /*
372          * this can contain an abstract of the article.
373          * but we don't want to verwrite a full document if we already have it.
374          */
375         if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0))
376         {
377                 NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
378                 StrBufTrim(ri->description);
379         }
380 }
381
382 void RSS_item_description_end (StrBuf *CData,
383                                rss_item *ri,
384                                rss_aggregator *RSSAggr,
385                                const char** Attr)
386 {
387         long olen = StrLength (ri->description);
388         long clen = StrLength (CData);
389         if (clen > 0)
390         {
391                 if (olen == 0) {
392                         NewStrBufDupAppendFlush(&ri->description,
393                                                 CData,
394                                                 NULL,
395                                                 0);
396                         StrBufTrim(ri->description);
397                 }
398                 else if (olen < clen) {
399                         FlushStrBuf(ri->description);
400                         NewStrBufDupAppendFlush(&ri->description,
401                                                 CData,
402                                                 NULL,
403                                                 0);
404                         StrBufTrim(ri->description);
405                 }
406         }
407 }
408
409 void ATOM_item_published_end (StrBuf *CData,
410                               rss_item *ri,
411                               rss_aggregator *RSSAggr,
412                               const char** Attr)
413 {
414         if (StrLength(CData) > 0) {
415                 StrBufTrim(CData);
416                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
417         }
418 }
419
420 void ATOM_item_updated_end (StrBuf *CData,
421                             rss_item *ri,
422                             rss_aggregator *RSSAggr,
423                             const char** Attr)
424 {
425         if (StrLength(CData) > 0) {
426                 StrBufTrim(CData);
427                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
428         }
429 }
430
431 void RSS_item_pubdate_end (StrBuf *CData,
432                            rss_item *ri,
433                            rss_aggregator *RSSAggr,
434                            const char** Attr)
435 {
436         if (StrLength(CData) > 0) {
437                 StrBufTrim(CData);
438                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
439         }
440 }
441
442
443 void RSS_item_date_end (StrBuf *CData,
444                         rss_item *ri,
445                         rss_aggregator *RSSAggr,
446                         const char** Attr)
447 {
448         if (StrLength(CData) > 0) {
449                 StrBufTrim(CData);
450                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
451         }
452 }
453
454
455
456 void RSS_item_author_end(StrBuf *CData,
457                          rss_item *ri,
458                          rss_aggregator *RSSAggr,
459                          const char** Attr)
460 {
461         if (StrLength(CData) > 0) {
462                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
463                 StrBufTrim(ri->author_or_creator);
464         }
465 }
466
467
468 void ATOM_item_name_end(StrBuf *CData,
469                         rss_item *ri,
470                         rss_aggregator *RSSAggr,
471                         const char** Attr)
472 {
473         if (StrLength(CData) > 0) {
474                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
475                 StrBufTrim(ri->author_or_creator);
476         }
477 }
478
479 void ATOM_item_email_end(StrBuf *CData,
480                          rss_item *ri,
481                          rss_aggregator *RSSAggr,
482                          const char** Attr)
483 {
484         if (StrLength(CData) > 0) {
485                 NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0);
486                 StrBufTrim(ri->author_email);
487         }
488 }
489
490 void RSS_item_creator_end(StrBuf *CData,
491                           rss_item *ri,
492                           rss_aggregator *RSSAggr,
493                           const char** Attr)
494 {
495         if ((StrLength(CData) > 0) &&
496             (StrLength(ri->author_or_creator) == 0))
497         {
498                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
499                 StrBufTrim(ri->author_or_creator);
500         }
501 }
502
503
504 void ATOM_item_uri_end(StrBuf *CData,
505                        rss_item *ri,
506                        rss_aggregator *RSSAggr,
507                        const char** Attr)
508 {
509         if (StrLength(CData) > 0) {
510                 NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0);
511                 StrBufTrim(ri->author_url);
512         }
513 }
514
515 void RSS_item_item_end(StrBuf *CData,
516                        rss_item *ri,
517                        rss_aggregator *RSSAggr,
518                        const char** Attr)
519 {
520         --ri->item_tag_nesting;
521         rss_remember_item(ri, RSSAggr);
522 }
523
524
525 void ATOM_item_entry_end(StrBuf *CData,
526                          rss_item *ri,
527                          rss_aggregator *RSSAggr,
528                          const char** Attr)
529 {
530         --ri->item_tag_nesting;
531         rss_remember_item(ri, RSSAggr);
532 }
533
534 void RSS_item_rss_end(StrBuf *CData,
535                       rss_item *ri,
536                       rss_aggregator *RSSAggr,
537                       const char** Attr)
538 {
539         AsyncIO         *IO = &RSSAggr->IO;
540         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
541         ri->done_parsing = 1;
542 }
543
544 void RSS_item_rdf_end(StrBuf *CData,
545                       rss_item *ri,
546                       rss_aggregator *RSSAggr,
547                       const char** Attr)
548 {
549         AsyncIO         *IO = &RSSAggr->IO;
550         EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
551         ri->done_parsing = 1;
552 }
553
554
555 void RSSATOM_item_ignore(StrBuf *CData,
556                          rss_item *ri,
557                          rss_aggregator *RSSAggr,
558                          const char** Attr)
559 {
560 }
561
562
563
564 /*
565  * This callback stores up the data which appears in between tags.
566  */
567 void rss_xml_cdata_start(void *data)
568 {
569         rss_aggregator *RSSAggr = (rss_aggregator*) data;
570
571         FlushStrBuf(RSSAggr->CData);
572 }
573
/*
 * CDATA section end callback: nothing to do.
 */
void rss_xml_cdata_end(void *data)
{
        (void) data;
}
577 void rss_xml_chardata(void *data, const XML_Char *s, int len)
578 {
579         rss_aggregator *RSSAggr = (rss_aggregator*) data;
580
581         StrBufAppendBufPlain (RSSAggr->CData, s, len, 0);
582 }
583
584
585 /******************************************************************************
586  *                            RSS parser logic                                *
587  ******************************************************************************/
588
589 extern pthread_mutex_t RSSQueueMutex;
590
591 HashList *StartHandlers = NULL;
592 HashList *EndHandlers = NULL;
593 HashList *KnownNameSpaces = NULL;
594
595 void FreeNetworkSaveMessage (void *vMsg)
596 {
597         networker_save_message *Msg = (networker_save_message *) vMsg;
598
599         CM_FreeContents(&Msg->Msg);
600         FreeStrBuf(&Msg->Message);
601         FreeStrBuf(&Msg->MsgGUID);
602
603         FreeStrBuf(&Msg->author_email);
604         FreeStrBuf(&Msg->author_or_creator);
605         FreeStrBuf(&Msg->title);
606         FreeStrBuf(&Msg->description);
607
608         FreeStrBuf(&Msg->link);
609         FreeStrBuf(&Msg->linkTitle);
610
611         FreeStrBuf(&Msg->reLink);
612         FreeStrBuf(&Msg->reLinkTitle);
613
614         free(Msg);
615 }
616
617
618 /*
619  * Commit a fetched and parsed RSS item to disk
620  */
621 void rss_remember_item(rss_item *ri, rss_aggregator *RSSAggr)
622 {
623         networker_save_message *SaveMsg;
624         struct MD5Context md5context;
625         u_char rawdigest[MD5_DIGEST_LEN];
626         StrBuf *guid;
627         AsyncIO *IO = &RSSAggr->IO;
628         int n;
629
630         SaveMsg = (networker_save_message *) malloc(sizeof(networker_save_message));
631         memset(SaveMsg, 0, sizeof(networker_save_message));
632
633         /* Construct a GUID to use in the S_USETABLE table.
634          * If one is not present in the item itself, make one up.
635          */
636         if (ri->guid != NULL) {
637                 StrBufSpaceToBlank(ri->guid);
638                 StrBufTrim(ri->guid);
639                 guid = NewStrBufPlain(HKEY("rss/"));
640                 StrBufAppendBuf(guid, ri->guid, 0);
641         }
642         else {
643                 MD5Init(&md5context);
644                 if (ri->title != NULL) {
645                         MD5Update(&md5context, (const unsigned char*)SKEY(ri->title));
646                 }
647                 if (ri->link != NULL) {
648                         MD5Update(&md5context, (const unsigned char*)SKEY(ri->link));
649                 }
650                 MD5Final(rawdigest, &md5context);
651                 guid = NewStrBufPlain(NULL, MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
652                 StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
653                 StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
654         }
655
656         /* translate Item into message. */
657         EVRSSATOMM_syslog(LOG_DEBUG, "RSS: translating item...\n");
658         if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
659         StrBufSpaceToBlank(ri->description);
660         SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
661         SaveMsg->Msg.cm_anon_type = MES_NORMAL;
662         SaveMsg->Msg.cm_format_type = FMT_RFC822;
663
664         /* gather the cheaply computed information now... */
665
666         if (ri->guid != NULL) {
667                 CM_SetField(&SaveMsg->Msg, eExclusiveID, SKEY(ri->guid));
668         }
669
670         SaveMsg->MsgGUID = guid;
671
672         if (ri->pubdate <= 0) {
673                 ri->pubdate = time(NULL);
674         }
675         CM_SetFieldLONG(&SaveMsg->Msg, eTimestamp, ri->pubdate);
676         if (ri->channel_title != NULL) {
677                 if (StrLength(ri->channel_title) > 0) {
678                         CM_SetField(&SaveMsg->Msg, eOriginalRoom, SKEY(ri->channel_title));
679                 }
680         }
681
682         /* remember the ones for defferred processing to save computing power after we know if we realy need it. */
683
684         SaveMsg->author_or_creator = ri->author_or_creator;
685         ri->author_or_creator = NULL;
686
687         SaveMsg->author_email = ri->author_email;
688         ri->author_email = NULL;
689
690         SaveMsg->title = ri->title;
691         ri->title = NULL;
692
693         SaveMsg->link = ri->link;
694         ri->link = NULL;
695
696         SaveMsg->description = ri->description;
697         ri->description = NULL;
698
699         SaveMsg->linkTitle = ri->linkTitle;
700         ri->linkTitle = NULL;
701
702         SaveMsg->reLink = ri->reLink;
703         ri->reLink = NULL;
704
705         SaveMsg->reLinkTitle = ri->reLinkTitle;
706         ri->reLinkTitle = NULL;
707
708         n = GetCount(RSSAggr->Messages) + 1;
709         Put(RSSAggr->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
710 }
711
712
713
714 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
715 {
716         rss_xml_handler *h;
717         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
718         AsyncIO         *IO = &RSSAggr->IO;
719         rss_item        *ri = RSSAggr->Item;
720         void            *pv;
721         const char      *pel;
722         char            *sep = NULL;
723
724         /* Axe the namespace, we don't care about it */
725         /*
726           syslog(LOG_DEBUG,
727           "RSS: supplied el %d: %s\n", RSSAggr->RSSAggr->ItemType, supplied_el);
728         */
729         pel = supplied_el;
730         while (sep = strchr(pel, ':'), sep) {
731                 pel = sep + 1;
732         }
733
734         if (pel != supplied_el)
735         {
736                 void *v;
737
738                 if (!GetHash(KnownNameSpaces,
739                              supplied_el,
740                              pel - supplied_el - 1,
741                              &v))
742                 {
743                         EVRSSATOM_syslog(LOG_DEBUG,
744                                          "RSS: START ignoring "
745                                          "because of wrong namespace [%s]\n",
746                                          supplied_el);
747                         return;
748                 }
749         }
750
751         StrBufPlain(RSSAggr->Key, pel, -1);
752         StrBufLowerCase(RSSAggr->Key);
753         if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv))
754         {
755                 h = (rss_xml_handler*) pv;
756
757                 if (((h->Flags & RSS_UNSET) != 0) &&
758                     (RSSAggr->ItemType == RSS_UNSET))
759                 {
760                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
761                 }
762                 else if (((h->Flags & RSS_RSS) != 0) &&
763                     (RSSAggr->ItemType == RSS_RSS))
764                 {
765                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
766                 }
767                 else if (((h->Flags & RSS_ATOM) != 0) &&
768                          (RSSAggr->ItemType == RSS_ATOM))
769                 {
770                         h->Handler(RSSAggr->CData,
771                                    ri,
772                                    RSSAggr,
773                                    attr);
774                 }
775                 else
776                         EVRSSATOM_syslog(LOG_DEBUG,
777                                           "RSS: START unhandled: [%s] [%s]...\n",
778                                          pel,
779                                          supplied_el);
780         }
781         else
782                 EVRSSATOM_syslog(LOG_DEBUG,
783                                  "RSS: START unhandled: [%s] [%s]...\n",
784                                  pel,
785                                  supplied_el);
786 }
787
788 void rss_xml_end(void *data, const char *supplied_el)
789 {
790         rss_xml_handler *h;
791         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
792         AsyncIO         *IO = &RSSAggr->IO;
793         rss_item        *ri = RSSAggr->Item;
794         const char      *pel;
795         char            *sep = NULL;
796         void            *pv;
797
798         /* Axe the namespace, we don't care about it */
799         pel = supplied_el;
800         while (sep = strchr(pel, ':'), sep) {
801                 pel = sep + 1;
802         }
803         EVRSSATOM_syslog(LOG_DEBUG, "RSS: END %s...\n", supplied_el);
804         if (pel != supplied_el)
805         {
806                 void *v;
807
808                 if (!GetHash(KnownNameSpaces,
809                              supplied_el,
810                              pel - supplied_el - 1,
811                              &v))
812                 {
813                         EVRSSATOM_syslog(LOG_DEBUG,
814                                          "RSS: END ignoring because of wrong namespace"
815                                          "[%s] = [%s]\n",
816                                          supplied_el,
817                                          ChrPtr(RSSAggr->CData));
818                         FlushStrBuf(RSSAggr->CData);
819                         return;
820                 }
821         }
822
823         StrBufPlain(RSSAggr->Key, pel, -1);
824         StrBufLowerCase(RSSAggr->Key);
825         if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv))
826         {
827                 h = (rss_xml_handler*) pv;
828
829                 if (((h->Flags & RSS_UNSET) != 0) &&
830                     (RSSAggr->ItemType == RSS_UNSET))
831                 {
832                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
833                 }
834                 else if (((h->Flags & RSS_RSS) != 0) &&
835                     (RSSAggr->ItemType == RSS_RSS))
836                 {
837                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
838                 }
839                 else if (((h->Flags & RSS_ATOM) != 0) &&
840                          (RSSAggr->ItemType == RSS_ATOM))
841                 {
842                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
843                 }
844                 else
845                         EVRSSATOM_syslog(LOG_DEBUG,
846                                          "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
847                                          pel,
848                                          supplied_el,
849                                          ChrPtr(RSSAggr->CData));
850         }
851         else
852                 EVRSSATOM_syslog(LOG_DEBUG,
853                                  "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
854                                  pel,
855                                  supplied_el,
856                                  ChrPtr(RSSAggr->CData));
857         FlushStrBuf(RSSAggr->CData);
858 }
859
860
861
862 /*
863  * Callback function for passing libcurl's output to expat for parsing
864  * we don't do streamed parsing so expat can handle non-utf8 documents
865 size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
866 {
867         XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
868         return (size*nmemb);
869 }
870  */
871
872
873
874 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
875 {
876         StrBuf *Buf;
877         rss_aggregator *RSSAggr;
878         rss_item *ri;
879         const char *at;
880         char *ptr;
881         long len;
882         const char *Key;
883
884         RSSAggr = IO->Data;
885         ri = RSSAggr->Item;
886         RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
887         RSSAggr->Key = NewStrBuf();
888         at = NULL;
889         StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at);
890         ptr = NULL;
891
892 #define encoding "encoding=\""
893         ptr = strstr(ChrPtr(RSSAggr->Key), encoding);
894         if (ptr != NULL)
895         {
896                 char *pche;
897
898                 ptr += sizeof (encoding) - 1;
899                 pche = strchr(ptr, '"');
900                 if (pche != NULL)
901                         StrBufCutAt(RSSAggr->Key, -1, pche);
902                 else
903                         ptr = "UTF-8";
904         }
905         else
906                 ptr = "UTF-8";
907
908         EVRSSATOM_syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(RSSAggr->Url));
909
910         RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
911         if (!RSSAggr->xp) {
912                 EVRSSATOMM_syslog(LOG_ALERT, "Cannot create XML parser!\n");
913                 return eAbort;
914         }
915         FlushStrBuf(RSSAggr->Key);
916
917         RSSAggr->Messages = NewHash(1, Flathash);
918         XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end);
919         XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata);
920         XML_SetUserData(RSSAggr->xp, RSSAggr);
921         XML_SetCdataSectionHandler(RSSAggr->xp,
922                                    rss_xml_cdata_start,
923                                    rss_xml_cdata_end);
924
925
926         len = StrLength(IO->HttpReq.ReplyData);
927         ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
928         XML_Parse(RSSAggr->xp, ptr, len, 0);
929         free (ptr);
930         if (ri->done_parsing == 0)
931                 XML_Parse(RSSAggr->xp, "", 0, 1);
932
933
934         EVRSSATOM_syslog(LOG_DEBUG, "RSS: XML Status [%s] \n",
935                          XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
936
937         XML_ParserFree(RSSAggr->xp);
938         flush_rss_item(ri);
939
940         Buf = NewStrBufDup(RSSAggr->rooms);
941         RSSAggr->recp.recp_room = SmashStrBuf(&Buf);
942         RSSAggr->recp.num_room = RSSAggr->roomlist_parts;
943         RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC;
944
945         RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
946
947         if (GetNextHashPos(RSSAggr->Messages,
948                            RSSAggr->Pos,
949                            &len,
950                            &Key,
951                            (void**) &RSSAggr->ThisMsg)) {
952                 return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry);
953         }
954         else {
955                 return eAbort;
956         }
957 }
958
959
960 /******************************************************************************
961  *                    RSS handler registering logic                           *
962  ******************************************************************************/
963
964 void AddRSSStartHandler(rss_handler_func Handler,
965                         int Flags,
966                         const char *key,
967                         long len)
968 {
969         rss_xml_handler *h;
970         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
971         h->Flags = Flags;
972         h->Handler = Handler;
973         Put(StartHandlers, key, len, h, NULL);
974 }
975
976 void AddRSSEndHandler(rss_handler_func Handler,
977                       int Flags,
978                       const char *key,
979                       long len)
980 {
981         rss_xml_handler *h;
982         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
983         h->Flags = Flags;
984         h->Handler = Handler;
985         Put(EndHandlers, key, len, h, NULL);
986 }
987
988 void rss_parser_cleanup(void)
989 {
990         DeleteHash(&StartHandlers);
991         DeleteHash(&EndHandlers);
992         DeleteHash(&KnownNameSpaces);
993 }
994
995 void LogDebugEnableRSSATOMParser(const int n)
996 {
997         RSSAtomParserDebugEnabled = n;
998 }
999
1000 CTDL_MODULE_INIT(rssparser)
1001 {
1002         if (!threading)
1003         {
1004                 StartHandlers = NewHash(1, NULL);
1005                 EndHandlers = NewHash(1, NULL);
1006
1007                 AddRSSStartHandler(RSS_item_rss_start,     RSS_UNSET, HKEY("rss"));
1008                 AddRSSStartHandler(RSS_item_rdf_start,     RSS_UNSET, HKEY("rdf"));
1009                 AddRSSStartHandler(ATOM_item_feed_start,   RSS_UNSET, HKEY("feed"));
1010                 AddRSSStartHandler(RSS_item_item_start,    RSS_RSS, HKEY("item"));
1011                 AddRSSStartHandler(ATOM_item_entry_start,  RSS_ATOM, HKEY("entry"));
1012                 AddRSSStartHandler(ATOM_item_link_start,   RSS_ATOM, HKEY("link"));
1013
1014                 AddRSSEndHandler(ATOMRSS_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1015                 AddRSSEndHandler(RSS_item_guid_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid"));
1016                 AddRSSEndHandler(ATOM_item_id_end,         RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id"));
1017                 AddRSSEndHandler(RSS_item_link_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("link"));
1018 #if 0
1019 // hm, rss to the comments of that blog, might be interesting in future, but...
1020                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss"));
1021 // comment count...
1022                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments"));
1023 #endif
1024                 AddRSSEndHandler(RSSATOM_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
1025                 AddRSSEndHandler(ATOM_item_content_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("content"));
1026                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_ATOM|RSS_REQUIRE_BUF, HKEY("encoded"));
1027                 AddRSSEndHandler(ATOM_item_summary_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("summary"));
1028                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("description"));
1029                 AddRSSEndHandler(ATOM_item_published_end,  RSS_ATOM|RSS_REQUIRE_BUF, HKEY("published"));
1030                 AddRSSEndHandler(ATOM_item_updated_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("updated"));
1031                 AddRSSEndHandler(RSS_item_pubdate_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("pubdate"));
1032                 AddRSSEndHandler(RSS_item_date_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("date"));
1033                 AddRSSEndHandler(RSS_item_author_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("author"));
1034                 AddRSSEndHandler(RSS_item_creator_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("creator"));
1035 /* <author> */
1036                 AddRSSEndHandler(ATOM_item_email_end,      RSS_ATOM|RSS_REQUIRE_BUF, HKEY("email"));
1037                 AddRSSEndHandler(ATOM_item_name_end,       RSS_ATOM|RSS_REQUIRE_BUF, HKEY("name"));
1038                 AddRSSEndHandler(ATOM_item_uri_end,        RSS_ATOM|RSS_REQUIRE_BUF, HKEY("uri"));
1039 /* </author> */
1040                 AddRSSEndHandler(RSS_item_item_end,        RSS_RSS, HKEY("item"));
1041                 AddRSSEndHandler(RSS_item_rss_end,         RSS_RSS, HKEY("rss"));
1042                 AddRSSEndHandler(RSS_item_rdf_end,         RSS_RSS, HKEY("rdf"));
1043                 AddRSSEndHandler(ATOM_item_entry_end,      RSS_ATOM, HKEY("entry"));
1044
1045
1046 /* at the start of atoms: <seq> <li>link to resource</li></seq> ignore them. */
1047                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1048                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
1049                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1050                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
1051
1052 /* links to other feed generators... */
1053                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1054                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
1055                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1056                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
1057
1058                 KnownNameSpaces = NewHash(1, NULL);
1059                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearch/1.1/"), NULL, reference_free_handler);
1060                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearchrss/1.0/"), NULL, reference_free_handler);
1061                 Put(KnownNameSpaces, HKEY("http://backend.userland.com/creativeCommonsRssModule"), NULL, reference_free_handler);
1062                 Put(KnownNameSpaces, HKEY("http://purl.org/atom/ns#"), NULL, reference_free_handler);
1063                 Put(KnownNameSpaces, HKEY("http://purl.org/dc/elements/1.1/"), NULL, reference_free_handler);
1064                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1065                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/content/"), NULL, reference_free_handler);
1066                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/slash/"), NULL, reference_free_handler);
1067                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/syndication/"), NULL, reference_free_handler);
1068                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
1069                 Put(KnownNameSpaces, HKEY("http://purl.org/syndication/thread/1.0"), NULL, reference_free_handler);
1070                 Put(KnownNameSpaces, HKEY("http://rssnamespace.org/feedburner/ext/1.0"), NULL, reference_free_handler);
1071                 Put(KnownNameSpaces, HKEY("http://schemas.google.com/g/2005"), NULL, reference_free_handler);
1072                 Put(KnownNameSpaces, HKEY("http://webns.net/mvcb/"), NULL, reference_free_handler);
1073                 Put(KnownNameSpaces, HKEY("http://web.resource.org/cc/"), NULL, reference_free_handler);
1074                 Put(KnownNameSpaces, HKEY("http://wellformedweb.org/CommentAPI/"), NULL, reference_free_handler);
1075                 Put(KnownNameSpaces, HKEY("http://www.georss.org/georss"), NULL, reference_free_handler);
1076                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/xhtml"), NULL, reference_free_handler);
1077                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1078                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1079                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2003/01/geo/wgs84_pos#"), NULL, reference_free_handler);
1080                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2005/Atom"), NULL, reference_free_handler);
1081                 Put(KnownNameSpaces, HKEY("urn:flickr:"), NULL, reference_free_handler);
1082 #if 0
1083                 /* we don't like these namespaces because of they shadow our usefull parameters. */
1084                 Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler);
1085 #endif
1086                 CtdlRegisterDebugFlagHook(HKEY("RSSAtomParser"), LogDebugEnableRSSATOMParser, &RSSAtomParserDebugEnabled);
1087                 CtdlRegisterCleanupHook(rss_parser_cleanup);
1088         }
1089         return "rssparser";
1090 }