Cleanup of shutdown of event contexts
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
1 /*
2  * Bring external RSS feeds into rooms.
3  *
4  * Copyright (c) 2007-2010 by the citadel.org team
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  */
20
21 #include <stdlib.h>
22 #include <unistd.h>
23 #include <stdio.h>
24
25 #if TIME_WITH_SYS_TIME
26 # include <sys/time.h>
27 # include <time.h>
28 #else
29 # if HAVE_SYS_TIME_H
30 #  include <sys/time.h>
31 # else
32 #  include <time.h>
33 # endif
34 #endif
35
36 #include <ctype.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <expat.h>
42 #include <curl/curl.h>
43 #include <libcitadel.h>
44 #include "citadel.h"
45 #include "server.h"
46 #include "citserver.h"
47 #include "support.h"
48 #include "config.h"
49 #include "threads.h"
50 #include "ctdl_module.h"
51 #include "clientsocket.h"
52 #include "msgbase.h"
53 #include "parsedate.h"
54 #include "database.h"
55 #include "citadel_dirs.h"
56 #include "md5.h"
57 #include "context.h"
58 #include "event_client.h"
59 #include "rss_atom_parser.h"
60
/*
 * Forward declaration: the item/entry end handlers further down call
 * rss_save_item() before its definition appears in this file.
 */
void rss_save_item(rss_item *ri, rss_aggregator *Cfg);
62
63
/*
 * Convert an RDF/RSS datestamp into a time_t
 *
 * p    NUL-terminated date string; may be NULL.
 *
 * Returns 0 when p is NULL or shorter than 10 characters.  A string that
 * looks like a W3C datetime ("YYYY-MM-DD", optionally followed by
 * "Thh:mm") is converted with mktime(); seconds and any zone designator
 * are not read.  Everything else is handed to parsedate(); if that does
 * not yield a positive value the current time is returned.
 */
time_t rdf_parsedate(const char *p)
{
        struct tm tm;
        time_t t = 0;
        size_t len;

        if (!p) return 0L;
        len = strlen(p);
        if (len < 10) return 0L;

        memset(&tm, 0, sizeof tm);

        /*
         * If the timestamp appears to be in W3C datetime format, try to
         * parse it.  See also: http://www.w3.org/TR/NOTE-datetime
         *
         * This code, along with parsedate.c, is a potential candidate for
         * moving into libcitadel.
         */
        if ( (p[4] == '-') && (p[7] == '-') ) {
                tm.tm_year = atoi(&p[0]) - 1900;
                tm.tm_mon = atoi(&p[5]) - 1;
                tm.tm_mday = atoi(&p[8]);
                /*
                 * Only look at p[13] when the string is long enough to
                 * contain it; "2010-01-01T" would otherwise be read past
                 * its terminator.
                 */
                if ( (len >= 14) && (p[10] == 'T') && (p[13] == ':') ) {
                        tm.tm_hour = atoi(&p[11]);
                        tm.tm_min = atoi(&p[14]);
                }
                return mktime(&tm);
        }

        /* hmm... try RFC822 date stamp format */

        t = parsedate(p);
        if (t > 0) return(t);

        /* yeesh.  ok, just return the current date and time. */
        return(time(NULL));
}
103
104 void flush_rss_item(rss_item *ri)
105 {
106         /* Initialize the feed item data structure */
107         FreeStrBuf(&ri->guid);
108         FreeStrBuf(&ri->title);
109         FreeStrBuf(&ri->link);
110         FreeStrBuf(&ri->author_or_creator);
111         FreeStrBuf(&ri->author_email);
112         FreeStrBuf(&ri->author_url);
113         FreeStrBuf(&ri->description);
114
115         FreeStrBuf(&ri->linkTitle);
116         FreeStrBuf(&ri->reLink);
117         FreeStrBuf(&ri->reLinkTitle);
118         FreeStrBuf(&ri->channel_title);
119 }
120
121
122 /*******************************************************************************
123  *                               XML-Handler                                   *
124  *******************************************************************************/
125
126
127 void RSS_item_rss_start (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
128 {
129         syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n");
130         Cfg->ItemType = RSS_RSS;
131 }
132
133 void RSS_item_rdf_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
134 {
135         syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n");
136         Cfg->ItemType = RSS_RSS;
137 }
138
139 void ATOM_item_feed_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
140 {
141         syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n");
142         Cfg->ItemType = RSS_ATOM;
143 }
144
145
146 void RSS_item_item_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
147 {
148         ri->item_tag_nesting ++;
149         flush_rss_item(ri);
150 }
151
152 void ATOM_item_entry_start(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
153 {
154 /* Atom feed... */
155         ri->item_tag_nesting ++;
156         flush_rss_item(ri);
157 }
158
159 void ATOM_item_link_start (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
160 {
161         int i;
162         const char *pHref = NULL;
163         const char *pType = NULL;
164         const char *pRel = NULL;
165         const char *pTitle = NULL;
166
167         for (i = 0; Attr[i] != NULL; i+=2)
168         {
169                 if (!strcmp(Attr[i], "href"))
170                 {
171                         pHref = Attr[i+1];
172                 }
173                 else if (!strcmp(Attr[i], "rel"))
174                 {
175                         pRel = Attr[i+1];
176                 }
177                 else if (!strcmp(Attr[i], "type"))
178                 {
179                         pType = Attr[i+1];
180                 }
181                 else if (!strcmp(Attr[i], "title"))
182                 {
183                         pTitle = Attr[i+1];
184                 }
185         }
186         if (pHref == NULL)
187                 return; /* WHUT? Pointing... where? */
188         if ((pType != NULL) && !strcasecmp(pType, "application/atom+xml"))
189                 return; /* these just point to other rss resources, we're not interested in them. */
190         if (pRel != NULL)
191         {
192                 if (!strcasecmp (pRel, "replies"))
193                 {
194                         NewStrBufDupAppendFlush(&ri->reLink, NULL, pHref, -1);
195                         StrBufTrim(ri->link);
196                         NewStrBufDupAppendFlush(&ri->reLinkTitle, NULL, pTitle, -1);
197                 }
198                 else if (!strcasecmp(pRel, "alternate")) /* Alternative representation of this Item... */
199                 {
200                         NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
201                         StrBufTrim(ri->link);
202                         NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
203
204                 }
205 #if 0 /* these are also defined, but dunno what to do with them.. */
206                 else if (!strcasecmp(pRel, "related"))
207                 {
208                 }
209                 else if (!strcasecmp(pRel, "self"))
210                 {
211                 }
212                 else if (!strcasecmp(pRel, "enclosure"))
213                 {/* this reference can get big, and is probably the full article... */
214                 }
215                 else if (!strcasecmp(pRel, "via"))
216                 {/* this article was provided via... */
217                 }
218 #endif
219         }
220         else if (StrLength(ri->link) == 0)
221         {
222                 NewStrBufDupAppendFlush(&ri->link, NULL, pHref, -1);
223                 StrBufTrim(ri->link);
224                 NewStrBufDupAppendFlush(&ri->linkTitle, NULL, pTitle, -1);
225         }
226 }
227
228
229
230
231 void ATOMRSS_item_title_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
232 {
233         if ((ri->item_tag_nesting == 0) && (StrLength(CData) > 0)) {
234                 NewStrBufDupAppendFlush(&ri->channel_title, CData, NULL, 0);
235                 StrBufTrim(ri->channel_title);
236         }
237 }
238
239 void RSS_item_guid_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
240 {
241         if (StrLength(CData) > 0) {
242                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
243         }
244 }
245
246 void ATOM_item_id_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
247 {
248         if (StrLength(CData) > 0) {
249                 NewStrBufDupAppendFlush(&ri->guid, CData, NULL, 0);
250         }
251 }
252
253
254 void RSS_item_link_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
255 {
256         if (StrLength(CData) > 0) {
257                 NewStrBufDupAppendFlush(&ri->link, CData, NULL, 0);
258                 StrBufTrim(ri->link);
259         }
260 }
261 void RSS_item_relink_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
262 {
263         if (StrLength(CData) > 0) {
264                 NewStrBufDupAppendFlush(&ri->reLink, CData, NULL, 0);
265                 StrBufTrim(ri->reLink);
266         }
267 }
268
269 void RSSATOM_item_title_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
270 {
271         if (StrLength(CData) > 0) {
272                 NewStrBufDupAppendFlush(&ri->title, CData, NULL, 0);
273                 StrBufTrim(ri->title);
274         }
275 }
276
277 void ATOM_item_content_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
278 {
279         long olen = StrLength (ri->description);
280         long clen = StrLength (CData);
281         if (clen > 0) 
282         {
283                 if (olen == 0) {
284                         NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
285                         StrBufTrim(ri->description);
286                 }
287                 else if (olen < clen) {
288                         FlushStrBuf(ri->description);
289                         NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
290                         StrBufTrim(ri->description);
291                 }
292         }
293 }
294 void ATOM_item_summary_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
295 {
296         /* this can contain an abstract of the article. but we don't want to verwrite a full document if we already have it. */
297         if ((StrLength(CData) > 0) && (StrLength(ri->description) == 0))
298         {
299                 NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
300                 StrBufTrim(ri->description);
301         }
302 }
303
304 void RSS_item_description_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
305 {
306         long olen = StrLength (ri->description);
307         long clen = StrLength (CData);
308         if (clen > 0) 
309         {
310                 if (olen == 0) {
311                         NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
312                         StrBufTrim(ri->description);
313                 }
314                 else if (olen < clen) {
315                         FlushStrBuf(ri->description);
316                         NewStrBufDupAppendFlush(&ri->description, CData, NULL, 0);
317                         StrBufTrim(ri->description);
318                 }
319         }
320 }
321
322 void ATOM_item_published_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
323 {                 
324         if (StrLength(CData) > 0) {
325                 StrBufTrim(CData);
326                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
327         }
328 }
329
330 void ATOM_item_updated_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
331 {
332         if (StrLength(CData) > 0) {
333                 StrBufTrim(CData);
334                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
335         }
336 }
337
338 void RSS_item_pubdate_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
339 {
340         if (StrLength(CData) > 0) {
341                 StrBufTrim(CData);
342                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
343         }
344 }
345
346
347 void RSS_item_date_end (StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
348 {
349         if (StrLength(CData) > 0) {
350                 StrBufTrim(CData);
351                 ri->pubdate = rdf_parsedate(ChrPtr(CData));
352         }
353 }
354
355
356
357 void RSS_item_author_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
358 {
359         if (StrLength(CData) > 0) {
360                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
361                 StrBufTrim(ri->author_or_creator);
362         }
363 }
364
365
366 void ATOM_item_name_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
367 {
368         if (StrLength(CData) > 0) {
369                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
370                 StrBufTrim(ri->author_or_creator);
371         }
372 }
373
374 void ATOM_item_email_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
375 {
376         if (StrLength(CData) > 0) {
377                 NewStrBufDupAppendFlush(&ri->author_email, CData, NULL, 0);
378                 StrBufTrim(ri->author_email);
379         }
380 }
381
382 void RSS_item_creator_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
383 {
384         if ((StrLength(CData) > 0) && 
385             (StrLength(ri->author_or_creator) == 0))
386         {
387                 NewStrBufDupAppendFlush(&ri->author_or_creator, CData, NULL, 0);
388                 StrBufTrim(ri->author_or_creator);
389         }
390 }
391
392
393 void ATOM_item_uri_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
394 {
395         if (StrLength(CData) > 0) {
396                 NewStrBufDupAppendFlush(&ri->author_url, CData, NULL, 0);
397                 StrBufTrim(ri->author_url);
398         }
399 }
400
401 void RSS_item_item_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
402 {
403         --ri->item_tag_nesting;
404         rss_save_item(ri, Cfg);
405 }
406
407
408 void ATOM_item_entry_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
409 {
410         --ri->item_tag_nesting;
411         rss_save_item(ri, Cfg);
412 }
413
414 void RSS_item_rss_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
415 {
416 //              syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
417         ri->done_parsing = 1;
418         
419 }
420 void RSS_item_rdf_end(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
421 {
422 //              syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
423         ri->done_parsing = 1;
424 }
425
426
427 void RSSATOM_item_ignore(StrBuf *CData, rss_item *ri, rss_aggregator *Cfg, const char** Attr)
428 {
429 }
430
431
432
433 /*
434  * This callback stores up the data which appears in between tags.
435  */
436 void rss_xml_cdata_start(void *data) 
437 {
438         rss_aggregator *RSSAggr = (rss_aggregator*) data;
439
440         FlushStrBuf(RSSAggr->CData);
441 }
442
/*
 * CDATA-section end callback; nothing needs to happen here.
 */
void rss_xml_cdata_end(void *data) 
{
        (void) data;
}
446 void rss_xml_chardata(void *data, const XML_Char *s, int len) 
447 {
448         rss_aggregator *RSSAggr = (rss_aggregator*) data;
449
450         StrBufAppendBufPlain (RSSAggr->CData, s, len, 0);
451 }
452
453
454 /*******************************************************************************
455  *                            RSS parser logic                                 *
456  *******************************************************************************/
457
/* Defined elsewhere; not referenced in the visible part of this file. */
extern pthread_mutex_t RSSQueueMutex;

/* Lower-cased element name -> rss_xml_handler, consulted by rss_xml_start(). */
HashList *StartHandlers = NULL;
/* Lower-cased element name -> rss_xml_handler, consulted by rss_xml_end(). */
HashList *EndHandlers = NULL;
/* Namespace part of an element name; elements whose namespace is not in here are skipped. */
HashList *KnownNameSpaces = NULL;
463
464 void FreeNetworkSaveMessage (void *vMsg)
465 {
466         networker_save_message *Msg = (networker_save_message *) vMsg;
467
468         CtdlFreeMessageContents(&Msg->Msg);
469         FreeStrBuf(&Msg->Message);
470         FreeStrBuf(&Msg->MsgGUID);
471         free(Msg);
472 }
473
474
475 void AppendLink(StrBuf *Message,
476                 StrBuf *link,
477                 StrBuf *LinkTitle,
478                 const char *Title)
479 {
480         if (StrLength(link) > 0)
481         {
482                 StrBufAppendBufPlain(Message, HKEY("<a href=\""), 0);
483                 StrBufAppendBuf(Message, link, 0);
484                 StrBufAppendBufPlain(Message, HKEY("\">"), 0);
485                 if (StrLength(LinkTitle) > 0)
486                         StrBufAppendBuf(Message, LinkTitle, 0);
487                 else if ((Title != NULL) && !IsEmptyStr(Title))
488                         StrBufAppendBufPlain(Message, Title, -1, 0);
489                 else
490                         StrBufAppendBuf(Message, link, 0);
491                 StrBufAppendBufPlain(Message, HKEY("</a><br>\n"), 0);
492         }
493 }
494
495 /*
496  * Commit a fetched and parsed RSS item to disk
497  */
498 void rss_save_item(rss_item *ri, rss_aggregator *Cfg)
499 {
500         networker_save_message *SaveMsg;
501         struct MD5Context md5context;
502         u_char rawdigest[MD5_DIGEST_LEN];
503         int msglen = 0;
504         StrBuf *Message;
505         StrBuf *guid;
506         AsyncIO *IO = &Cfg->IO;
507         int n;
508
509
510         SaveMsg = (networker_save_message *) malloc(
511                 sizeof(networker_save_message));
512         memset(SaveMsg, 0, sizeof(networker_save_message));
513
514         /* Construct a GUID to use in the S_USETABLE table.
515          * If one is not present in the item itself, make one up.
516          */
517         if (ri->guid != NULL) {
518                 StrBufSpaceToBlank(ri->guid);
519                 StrBufTrim(ri->guid);
520                 guid = NewStrBufPlain(HKEY("rss/"));
521                 StrBufAppendBuf(guid, ri->guid, 0);
522         }
523         else {
524                 MD5Init(&md5context);
525                 if (ri->title != NULL) {
526                         MD5Update(&md5context,
527                                   (const unsigned char*)SKEY(ri->title));
528                 }
529                 if (ri->link != NULL) {
530                         MD5Update(&md5context,
531                                   (const unsigned char*)SKEY(ri->link));
532                 }
533                 MD5Final(rawdigest, &md5context);
534                 guid = NewStrBufPlain(NULL,
535                                       MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
536                 StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
537                 StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
538         }
539
540         /* translate Item into message. */
541         EVM_syslog(LOG_DEBUG, "RSS: translating item...\n");
542         if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
543         StrBufSpaceToBlank(ri->description);
544         SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
545         SaveMsg->Msg.cm_anon_type = MES_NORMAL;
546         SaveMsg->Msg.cm_format_type = FMT_RFC822;
547
548         if (ri->guid != NULL) {
549                 SaveMsg->Msg.cm_fields['E'] = strdup(ChrPtr(ri->guid));
550         }
551
552         if (ri->author_or_creator != NULL) {
553                 char *From;
554                 StrBuf *Encoded = NULL;
555                 int FromAt;
556
557                 From = html_to_ascii(ChrPtr(ri->author_or_creator),
558                                      StrLength(ri->author_or_creator),
559                                      512, 0);
560                 StrBufPlain(ri->author_or_creator, From, -1);
561                 StrBufTrim(ri->author_or_creator);
562                 free(From);
563
564                 FromAt = strchr(ChrPtr(ri->author_or_creator), '@') != NULL;
565                 if (!FromAt && StrLength (ri->author_email) > 0)
566                 {
567                         StrBufRFC2047encode(&Encoded, ri->author_or_creator);
568                         SaveMsg->Msg.cm_fields['A'] = SmashStrBuf(&Encoded);
569                         SaveMsg->Msg.cm_fields['P'] =
570                                 SmashStrBuf(&ri->author_email);
571                 }
572                 else
573                 {
574                         if (FromAt)
575                         {
576                                 SaveMsg->Msg.cm_fields['A'] =
577                                         SmashStrBuf(&ri->author_or_creator);
578                                 SaveMsg->Msg.cm_fields['P'] =
579                                         strdup(SaveMsg->Msg.cm_fields['A']);
580                         }
581                         else
582                         {
583                                 StrBufRFC2047encode(&Encoded,
584                                                     ri->author_or_creator);
585                                 SaveMsg->Msg.cm_fields['A'] =
586                                         SmashStrBuf(&Encoded);
587                                 SaveMsg->Msg.cm_fields['P'] =
588                                         strdup("rss@localhost");
589
590                         }
591                         if (ri->pubdate <= 0) {
592                                 ri->pubdate = time(NULL);
593                         }
594                 }
595         }
596         else {
597                 SaveMsg->Msg.cm_fields['A'] = strdup("rss");
598         }
599
600         SaveMsg->Msg.cm_fields['N'] = strdup(NODENAME);
601         if (ri->title != NULL) {
602                 long len;
603                 char *Sbj;
604                 StrBuf *Encoded, *QPEncoded;
605
606                 QPEncoded = NULL;
607                 StrBufSpaceToBlank(ri->title);
608                 len = StrLength(ri->title);
609                 Sbj = html_to_ascii(ChrPtr(ri->title), len, 512, 0);
610                 len = strlen(Sbj);
611                 if (Sbj[len - 1] == '\n')
612                 {
613                         len --;
614                         Sbj[len] = '\0';
615                 }
616                 Encoded = NewStrBufPlain(Sbj, len);
617                 free(Sbj);
618
619                 StrBufTrim(Encoded);
620                 StrBufRFC2047encode(&QPEncoded, Encoded);
621
622                 SaveMsg->Msg.cm_fields['U'] = SmashStrBuf(&QPEncoded);
623                 FreeStrBuf(&Encoded);
624         }
625         SaveMsg->Msg.cm_fields['T'] = malloc(64);
626         snprintf(SaveMsg->Msg.cm_fields['T'], 64, "%ld", ri->pubdate);
627         if (ri->channel_title != NULL) {
628                 if (StrLength(ri->channel_title) > 0) {
629                         SaveMsg->Msg.cm_fields['O'] =
630                                 strdup(ChrPtr(ri->channel_title));
631                 }
632         }
633         if (ri->link == NULL)
634                 ri->link = NewStrBufPlain(HKEY(""));
635
636 #if 0 /* temporarily disable shorter urls. */
637         SaveMsg->Msg.cm_fields[TMP_SHORTER_URLS] =
638                 GetShorterUrls(ri->description);
639 #endif
640
641         msglen += 1024 + StrLength(ri->link) + StrLength(ri->description) ;
642
643         Message = NewStrBufPlain(NULL, StrLength(ri->description));
644
645         StrBufPlain(Message, HKEY(
646                             "Content-type: text/html; charset=\"UTF-8\"\r\n\r\n"
647                             "<html><body>\n"));
648 #if 0 /* disable shorter url for now. */
649         SaveMsg->Msg.cm_fields[TMP_SHORTER_URL_OFFSET] = StrLength(Message);
650 #endif
651         StrBufAppendBuf(Message, ri->description, 0);
652         StrBufAppendBufPlain(Message, HKEY("<br><br>\n"), 0);
653
654         AppendLink(Message, ri->link, ri->linkTitle, NULL);
655         AppendLink(Message, ri->reLink, ri->reLinkTitle, "Reply to this");
656         StrBufAppendBufPlain(Message, HKEY("</body></html>\n"), 0);
657
658         SaveMsg->MsgGUID = guid;
659         SaveMsg->Message = Message;
660
661         n = GetCount(Cfg->Messages) + 1;
662         Put(Cfg->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
663 }
664
665
666 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
667 {
668         rss_xml_handler *h;
669         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
670         rss_item        *ri = RSSAggr->Item;
671         void            *pv;
672         const char      *pel;
673         char            *sep = NULL;
674
675         /* Axe the namespace, we don't care about it */
676 ///     syslog(LOG_DEBUG, "RSS: supplied el %d: %s...\n", RSSAggr->Cfg->ItemType, supplied_el);
677         pel = supplied_el;
678         while (sep = strchr(pel, ':'), sep) {
679                 pel = sep + 1;
680         }
681
682         if (pel != supplied_el)
683         {
684                 void *v;
685                 
686                 if (!GetHash(KnownNameSpaces, 
687                              supplied_el, 
688                              pel - supplied_el - 1,
689                              &v))
690                 {
691 #ifdef DEBUG_RSS
692                         syslog(LOG_DEBUG, "RSS: START ignoring because of wrong namespace [%s]\n", 
693                                       supplied_el);
694 #endif
695                         return;
696                 }
697         }
698
699         StrBufPlain(RSSAggr->Key, pel, -1);
700         StrBufLowerCase(RSSAggr->Key);
701         if (GetHash(StartHandlers, SKEY(RSSAggr->Key), &pv))
702         {
703                 h = (rss_xml_handler*) pv;
704
705                 if (((h->Flags & RSS_UNSET) != 0) && 
706                     (RSSAggr->ItemType == RSS_UNSET))
707                 {
708                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
709                 }
710                 else if (((h->Flags & RSS_RSS) != 0) &&
711                     (RSSAggr->ItemType == RSS_RSS))
712                 {
713                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);
714                 }
715                 else if (((h->Flags & RSS_ATOM) != 0) &&
716                          (RSSAggr->ItemType == RSS_ATOM))
717                 {
718                         h->Handler(RSSAggr->CData, ri, RSSAggr, attr);                  
719                 }
720 #ifdef DEBUG_RSS
721                 else 
722                         syslog(LOG_DEBUG, "RSS: START unhandled: [%s] [%s]...\n", pel, supplied_el);
723 #endif
724         }
725 #ifdef DEBUG_RSS
726         else 
727                 syslog(LOG_DEBUG, "RSS: START unhandled: [%s] [%s]...\n", pel,  supplied_el);
728 #endif
729 }
730
731 void rss_xml_end(void *data, const char *supplied_el)
732 {
733         rss_xml_handler *h;
734         rss_aggregator  *RSSAggr = (rss_aggregator*) data;
735         rss_item        *ri = RSSAggr->Item;
736         const char      *pel;
737         char            *sep = NULL;
738         void            *pv;
739
740         /* Axe the namespace, we don't care about it */
741         pel = supplied_el;
742         while (sep = strchr(pel, ':'), sep) {
743                 pel = sep + 1;
744         }
745 //      syslog(LOG_DEBUG, "RSS: END %s...\n", el);
746         if (pel != supplied_el)
747         {
748                 void *v;
749                 
750                 if (!GetHash(KnownNameSpaces, 
751                              supplied_el, 
752                              pel - supplied_el - 1,
753                              &v))
754                 {
755 #ifdef DEBUG_RSS
756                         syslog(LOG_DEBUG, "RSS: END ignoring because of wrong namespace [%s] = [%s]\n", 
757                                       supplied_el, ChrPtr(RSSAggr->CData));
758 #endif
759                         FlushStrBuf(RSSAggr->CData);
760                         return;
761                 }
762         }
763
764         StrBufPlain(RSSAggr->Key, pel, -1);
765         StrBufLowerCase(RSSAggr->Key);
766         if (GetHash(EndHandlers, SKEY(RSSAggr->Key), &pv))
767         {
768                 h = (rss_xml_handler*) pv;
769
770                 if (((h->Flags & RSS_UNSET) != 0) && 
771                     (RSSAggr->ItemType == RSS_UNSET))
772                 {
773                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
774                 }
775                 else if (((h->Flags & RSS_RSS) != 0) &&
776                     (RSSAggr->ItemType == RSS_RSS))
777                 {
778                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
779                 }
780                 else if (((h->Flags & RSS_ATOM) != 0) &&
781                          (RSSAggr->ItemType == RSS_ATOM))
782                 {
783                         h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
784                 }
785 #ifdef DEBUG_RSS
786                 else 
787                         syslog(LOG_DEBUG, "RSS: END   unhandled: [%s]  [%s] = [%s]...\n", pel, supplied_el, ChrPtr(RSSAggr->CData));
788 #endif
789         }
790 #ifdef DEBUG_RSS
791         else 
792                 syslog(LOG_DEBUG, "RSS: END   unhandled: [%s]  [%s] = [%s]...\n", pel, supplied_el, ChrPtr(RSSAggr->CData));
793 #endif
794         FlushStrBuf(RSSAggr->CData);
795 }
796
797 /*
798  * Callback function for passing libcurl's output to expat for parsing
799  * we don't do streamed parsing so expat can handle non-utf8 documents
800 size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
801 {
802         XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
803         return (size*nmemb);
804 }
805  */
806
807 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
808 {
809         StrBuf *Buf;
810         rss_aggregator *RSSAggr;
811         rss_item *ri;
812         const char *at;
813         char *ptr;
814         long len;
815         const char *Key;
816
817
818         if (IO->HttpReq.httpcode != 200)
819         {
820
821                 EV_syslog(LOG_DEBUG, "need a 200, got a %ld !\n",
822                           IO->HttpReq.httpcode);
823 // TODO: aide error message with rate limit
824                 return eAbort;
825         }
826
827         RSSAggr = IO->Data;
828         ri = RSSAggr->Item;
829         RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
830         RSSAggr->Key = NewStrBuf();
831         at = NULL;
832         StrBufSipLine(RSSAggr->Key, IO->HttpReq.ReplyData, &at);
833         ptr = NULL;
834
835 #define encoding "encoding=\""
836         ptr = strstr(ChrPtr(RSSAggr->Key), encoding);
837         if (ptr != NULL)
838         {
839                 char *pche;
840
841                 ptr += sizeof (encoding) - 1;
842                 pche = strchr(ptr, '"');
843                 if (pche != NULL)
844                         StrBufCutAt(RSSAggr->Key, -1, pche);
845                 else 
846                         ptr = "UTF-8";
847         }
848         else
849                 ptr = "UTF-8";
850
851         syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(RSSAggr->Url));
852
853         RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
854         if (!RSSAggr->xp) {
855                 syslog(LOG_DEBUG, "Cannot create XML parser!\n");
856                 return eAbort;
857         }
858         FlushStrBuf(RSSAggr->Key);
859
860         RSSAggr->Messages = NewHash(1, Flathash);
861         XML_SetElementHandler(RSSAggr->xp, rss_xml_start, rss_xml_end);
862         XML_SetCharacterDataHandler(RSSAggr->xp, rss_xml_chardata);
863         XML_SetUserData(RSSAggr->xp, RSSAggr);
864         XML_SetCdataSectionHandler(RSSAggr->xp,
865                                    rss_xml_cdata_start,
866                                    rss_xml_cdata_end);
867
868
869         len = StrLength(IO->HttpReq.ReplyData);
870         ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
871         XML_Parse(RSSAggr->xp, ptr, len, 0);
872         free (ptr);
873         if (ri->done_parsing == 0)
874                 XML_Parse(RSSAggr->xp, "", 0, 1);
875
876
877         syslog(LOG_DEBUG, "RSS: XML Status [%s] \n", 
878                       XML_ErrorString(
879                               XML_GetErrorCode(RSSAggr->xp)));
880
881         XML_ParserFree(RSSAggr->xp);
882         flush_rss_item(ri);
883
884         Buf = NewStrBufDup(RSSAggr->rooms);
885         RSSAggr->recp.recp_room = SmashStrBuf(&Buf);
886         RSSAggr->recp.num_room = RSSAggr->roomlist_parts;
887         RSSAggr->recp.recptypes_magic = RECPTYPES_MAGIC;
888
889         RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
890
891         ///Cfg->next_poll = time(NULL) + config.c_net_freq; 
892         if (GetNextHashPos(RSSAggr->Messages, RSSAggr->Pos, &len, &Key, (void**) &RSSAggr->ThisMsg))
893                 return QueueDBOperation(IO, RSS_FetchNetworkUsetableEntry);
894         else
895                 return eAbort;
896 }
897
898
899 /*******************************************************************************
900  *                     RSS handler registering logic                           *
901  *******************************************************************************/
902
903 void AddRSSStartHandler(rss_handler_func Handler, int Flags, const char *key, long len)
904 {
905         rss_xml_handler *h;
906         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
907         h->Flags = Flags;
908         h->Handler = Handler;
909         Put(StartHandlers, key, len, h, NULL);
910 }
911 void AddRSSEndHandler(rss_handler_func Handler, int Flags, const char *key, long len)
912 {
913         rss_xml_handler *h;
914         h = (rss_xml_handler*) malloc(sizeof (rss_xml_handler));
915         h->Flags = Flags;
916         h->Handler = Handler;
917         Put(EndHandlers, key, len, h, NULL);
918 }
919
920 void rss_parser_cleanup(void)
921 {
922         DeleteHash(&StartHandlers);
923         DeleteHash(&EndHandlers);
924         DeleteHash(&KnownNameSpaces);
925 }
926
927
928 CTDL_MODULE_INIT(rssparser)
929 {
930         if (!threading)
931         {
932                 StartHandlers = NewHash(1, NULL);
933                 EndHandlers = NewHash(1, NULL);
934
935                 AddRSSStartHandler(RSS_item_rss_start,     RSS_UNSET, HKEY("rss"));
936                 AddRSSStartHandler(RSS_item_rdf_start,     RSS_UNSET, HKEY("rdf"));
937                 AddRSSStartHandler(ATOM_item_feed_start,   RSS_UNSET, HKEY("feed"));
938                 AddRSSStartHandler(RSS_item_item_start,    RSS_RSS, HKEY("item"));
939                 AddRSSStartHandler(ATOM_item_entry_start,  RSS_ATOM, HKEY("entry"));
940                 AddRSSStartHandler(ATOM_item_link_start,   RSS_ATOM, HKEY("link"));
941
942                 AddRSSEndHandler(ATOMRSS_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
943                 AddRSSEndHandler(RSS_item_guid_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("guid"));
944                 AddRSSEndHandler(ATOM_item_id_end,         RSS_ATOM|RSS_REQUIRE_BUF, HKEY("id"));
945                 AddRSSEndHandler(RSS_item_link_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("link"));
946 #if 0 
947 // hm, rss to the comments of that blog, might be interesting in future, but... 
948                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("commentrss"));
949 // comment count...
950                 AddRSSEndHandler(RSS_item_relink_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("comments"));
951 #endif
952                 AddRSSEndHandler(RSSATOM_item_title_end,   RSS_ATOM|RSS_RSS|RSS_REQUIRE_BUF, HKEY("title"));
953                 AddRSSEndHandler(ATOM_item_content_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("content"));
954                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_ATOM|RSS_REQUIRE_BUF, HKEY("encoded"));
955                 AddRSSEndHandler(ATOM_item_summary_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("summary"));
956                 AddRSSEndHandler(RSS_item_description_end, RSS_RSS|RSS_REQUIRE_BUF, HKEY("description"));
957                 AddRSSEndHandler(ATOM_item_published_end,  RSS_ATOM|RSS_REQUIRE_BUF, HKEY("published"));
958                 AddRSSEndHandler(ATOM_item_updated_end,    RSS_ATOM|RSS_REQUIRE_BUF, HKEY("updated"));
959                 AddRSSEndHandler(RSS_item_pubdate_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("pubdate"));
960                 AddRSSEndHandler(RSS_item_date_end,        RSS_RSS|RSS_REQUIRE_BUF, HKEY("date"));
961                 AddRSSEndHandler(RSS_item_author_end,      RSS_RSS|RSS_REQUIRE_BUF, HKEY("author"));
962                 AddRSSEndHandler(RSS_item_creator_end,     RSS_RSS|RSS_REQUIRE_BUF, HKEY("creator"));
963 /* <author> */
964                 AddRSSEndHandler(ATOM_item_email_end,      RSS_ATOM|RSS_REQUIRE_BUF, HKEY("email"));
965                 AddRSSEndHandler(ATOM_item_name_end,       RSS_ATOM|RSS_REQUIRE_BUF, HKEY("name"));
966                 AddRSSEndHandler(ATOM_item_uri_end,        RSS_ATOM|RSS_REQUIRE_BUF, HKEY("uri"));
967 /* </author> */
968                 AddRSSEndHandler(RSS_item_item_end,        RSS_RSS, HKEY("item"));
969                 AddRSSEndHandler(RSS_item_rss_end,         RSS_RSS, HKEY("rss"));
970                 AddRSSEndHandler(RSS_item_rdf_end,         RSS_RSS, HKEY("rdf"));
971                 AddRSSEndHandler(ATOM_item_entry_end,      RSS_ATOM, HKEY("entry"));
972
973
974 /* at the start of atoms: <seq> <li>link to resource</li></seq> ignore them. */
975                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
976                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("seq"));
977                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
978                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("li"));
979
980 /* links to other feed generators... */
981                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
982                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("feedflare"));
983                 AddRSSStartHandler(RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
984                 AddRSSEndHandler  (RSSATOM_item_ignore,      RSS_RSS|RSS_ATOM, HKEY("browserfriendly"));
985
986                 KnownNameSpaces = NewHash(1, NULL);
987                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearch/1.1/"), NULL, reference_free_handler);
988                 Put(KnownNameSpaces, HKEY("http://a9.com/-/spec/opensearchrss/1.0/"), NULL, reference_free_handler);
989                 Put(KnownNameSpaces, HKEY("http://backend.userland.com/creativeCommonsRssModule"), NULL, reference_free_handler);
990                 Put(KnownNameSpaces, HKEY("http://purl.org/atom/ns#"), NULL, reference_free_handler);
991                 Put(KnownNameSpaces, HKEY("http://purl.org/dc/elements/1.1/"), NULL, reference_free_handler);
992                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
993                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/content/"), NULL, reference_free_handler);
994                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/slash/"), NULL, reference_free_handler);
995                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/modules/syndication/"), NULL, reference_free_handler);
996                 Put(KnownNameSpaces, HKEY("http://purl.org/rss/1.0/"), NULL, reference_free_handler);
997                 Put(KnownNameSpaces, HKEY("http://purl.org/syndication/thread/1.0"), NULL, reference_free_handler);
998                 Put(KnownNameSpaces, HKEY("http://rssnamespace.org/feedburner/ext/1.0"), NULL, reference_free_handler);
999                 Put(KnownNameSpaces, HKEY("http://schemas.google.com/g/2005"), NULL, reference_free_handler);
1000                 Put(KnownNameSpaces, HKEY("http://webns.net/mvcb/"), NULL, reference_free_handler);
1001                 Put(KnownNameSpaces, HKEY("http://web.resource.org/cc/"), NULL, reference_free_handler);
1002                 Put(KnownNameSpaces, HKEY("http://wellformedweb.org/CommentAPI/"), NULL, reference_free_handler);
1003                 Put(KnownNameSpaces, HKEY("http://www.georss.org/georss"), NULL, reference_free_handler);
1004                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/xhtml"), NULL, reference_free_handler);
1005                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1006                 Put(KnownNameSpaces, HKEY("http://www.w3.org/1999/02/22-rdf-syntax-ns#"), NULL, reference_free_handler);
1007                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2003/01/geo/wgs84_pos#"), NULL, reference_free_handler);
1008                 Put(KnownNameSpaces, HKEY("http://www.w3.org/2005/Atom"), NULL, reference_free_handler);
1009                 Put(KnownNameSpaces, HKEY("urn:flickr:"), NULL, reference_free_handler);
1010 #if 0
1011                 /* we don't like these namespaces because of they shadow our usefull parameters. */
1012                 Put(KnownNameSpaces, HKEY("http://search.yahoo.com/mrss/"), NULL, reference_free_handler);
1013 #endif
1014                 CtdlRegisterCleanupHook(rss_parser_cleanup);
1015         }
1016         return "rssparser";
1017 }