Move back to a single-threaded structure for the RSS feed puller
[citadel.git] / citadel / modules / rssclient / rss_atom_parser.c
index c1f5285804341e07aaa525ed0076c271fa0d8d07..7bdb94bafe59969ae87d70cb644d82e7e7f504a9 100644 (file)
@@ -1,21 +1,15 @@
 /*
  * Bring external RSS feeds into rooms.
  *
- * Copyright (c) 2007-2012 by the citadel.org team
+ * Copyright (c) 2007-2015 by the citadel.org team
  *
  * This program is open source software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 3.
  * 
- * 
- *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License for more details.
- *
- * 
- * 
- * 
  */
 
 #include <stdlib.h>
 #include "event_client.h"
 #include "rss_atom_parser.h"
 
-void rss_save_item(rss_item *ri, rss_aggregator *Cfg);
+void rss_remember_item(rss_item *ri, rss_aggregator *Cfg);
 
 int RSSAtomParserDebugEnabled = 0;
 
-#define N ((rss_aggregator*)IO->Data)->QRnumber
-
-#define DBGLOG(LEVEL) if ((LEVEL != LOG_DEBUG) || (RSSAtomParserDebugEnabled != 0))
-
-#define EVRSSATOM_syslog(LEVEL, FORMAT, ...)                           \
-       DBGLOG(LEVEL) syslog(LEVEL,                                     \
-                            "IO[%ld]CC[%d][%ld]RSSP" FORMAT,           \
-                            IO->ID, CCID, N, __VA_ARGS__)
-
-#define EVRSSATOMM_syslog(LEVEL, FORMAT)                               \
-       DBGLOG(LEVEL) syslog(LEVEL,                                     \
-                            "IO[%ld]CC[%d][%ld]RSSP" FORMAT,           \
-                            IO->ID, CCID, N)
-
-#define EVRSSATOMCS_syslog(LEVEL, FORMAT, ...)                 \
-       DBGLOG(LEVEL) syslog(LEVEL, "IO[%ld][%ld]RSSP" FORMAT,  \
-                            IO->ID, N, __VA_ARGS__)
-
-#define EVRSSATOMSM_syslog(LEVEL, FORMAT)                      \
-       DBGLOG(LEVEL) syslog(LEVEL, "IO[%ld][%ld]RSSP" FORMAT,  \
-                            IO->ID, N)
+#define N ((rss_aggregator*)IO->Data)->Cfg.QRnumber
 
 /*
  * Convert an RDF/RSS datestamp into a time_t
@@ -152,8 +126,7 @@ void RSS_item_rss_start (StrBuf *CData,
                         rss_aggregator *RSSAggr,
                         const char** Attr)
 {
-       AsyncIO         *IO = &RSSAggr->IO;
-       EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RSS feed.\n");
+       syslog(LOG_DEBUG, "RSS: This is an RSS feed.");
        RSSAggr->ItemType = RSS_RSS;
 }
 
@@ -162,8 +135,7 @@ void RSS_item_rdf_start(StrBuf *CData,
                        rss_aggregator *RSSAggr,
                        const char** Attr)
 {
-       AsyncIO         *IO = &RSSAggr->IO;
-       EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an RDF feed.\n");
+       syslog(LOG_DEBUG, "RSS: This is an RDF feed.");
        RSSAggr->ItemType = RSS_RSS;
 }
 
@@ -172,8 +144,7 @@ void ATOM_item_feed_start(StrBuf *CData,
                          rss_aggregator *RSSAggr,
                          const char** Attr)
 {
-       AsyncIO         *IO = &RSSAggr->IO;
-       EVRSSATOMM_syslog(LOG_DEBUG, "RSS: This is an ATOM feed.\n");
+       syslog(LOG_DEBUG, "RSS: This is an ATOM feed.");
        RSSAggr->ItemType = RSS_ATOM;
 }
 
@@ -524,7 +495,7 @@ void RSS_item_item_end(StrBuf *CData,
                       const char** Attr)
 {
        --ri->item_tag_nesting;
-       rss_save_item(ri, RSSAggr);
+       rss_remember_item(ri, RSSAggr);
 }
 
 
@@ -534,7 +505,7 @@ void ATOM_item_entry_end(StrBuf *CData,
                         const char** Attr)
 {
        --ri->item_tag_nesting;
-       rss_save_item(ri, RSSAggr);
+       rss_remember_item(ri, RSSAggr);
 }
 
 void RSS_item_rss_end(StrBuf *CData,
@@ -542,8 +513,7 @@ void RSS_item_rss_end(StrBuf *CData,
                      rss_aggregator *RSSAggr,
                      const char** Attr)
 {
-       AsyncIO         *IO = &RSSAggr->IO;
-       EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
+       syslog(LOG_DEBUG, "End of feed detected.  Closing parser.");
        ri->done_parsing = 1;
 }
 
@@ -552,8 +522,7 @@ void RSS_item_rdf_end(StrBuf *CData,
                      rss_aggregator *RSSAggr,
                      const char** Attr)
 {
-       AsyncIO         *IO = &RSSAggr->IO;
-       EVRSSATOMM_syslog(LOG_DEBUG, "End of feed detected.  Closing parser.\n");
+       syslog(LOG_DEBUG, "End of feed detected.  Closing parser.");
        ri->done_parsing = 1;
 }
 
@@ -602,50 +571,37 @@ void FreeNetworkSaveMessage (void *vMsg)
 {
        networker_save_message *Msg = (networker_save_message *) vMsg;
 
-       CtdlFreeMessageContents(&Msg->Msg);
+       CM_FreeContents(&Msg->Msg);
        FreeStrBuf(&Msg->Message);
        FreeStrBuf(&Msg->MsgGUID);
-       free(Msg);
-}
 
+       FreeStrBuf(&Msg->author_email);
+       FreeStrBuf(&Msg->author_or_creator);
+       FreeStrBuf(&Msg->title);
+       FreeStrBuf(&Msg->description);
 
-void AppendLink(StrBuf *Message,
-               StrBuf *link,
-               StrBuf *LinkTitle,
-               const char *Title)
-{
-       if (StrLength(link) > 0)
-       {
-               StrBufAppendBufPlain(Message, HKEY("<a href=\""), 0);
-               StrBufAppendBuf(Message, link, 0);
-               StrBufAppendBufPlain(Message, HKEY("\">"), 0);
-               if (StrLength(LinkTitle) > 0)
-                       StrBufAppendBuf(Message, LinkTitle, 0);
-               else if ((Title != NULL) && !IsEmptyStr(Title))
-                       StrBufAppendBufPlain(Message, Title, -1, 0);
-               else
-                       StrBufAppendBuf(Message, link, 0);
-               StrBufAppendBufPlain(Message, HKEY("</a><br>\n"), 0);
-       }
+       FreeStrBuf(&Msg->link);
+       FreeStrBuf(&Msg->linkTitle);
+
+       FreeStrBuf(&Msg->reLink);
+       FreeStrBuf(&Msg->reLinkTitle);
+
+       free(Msg);
 }
 
+
 /*
  * Commit a fetched and parsed RSS item to disk
  */
-void rss_save_item(rss_item *ri, rss_aggregator *RSSAggr)
+void rss_remember_item(rss_item *ri, rss_aggregator *RSSAggr)
 {
        networker_save_message *SaveMsg;
        struct MD5Context md5context;
        u_char rawdigest[MD5_DIGEST_LEN];
-       int msglen = 0;
-       StrBuf *Message;
        StrBuf *guid;
-       AsyncIO *IO = &RSSAggr->IO;
        int n;
 
-
-       SaveMsg = (networker_save_message *) malloc(
-               sizeof(networker_save_message));
+       SaveMsg = (networker_save_message *) malloc(sizeof(networker_save_message));
        memset(SaveMsg, 0, sizeof(networker_save_message));
 
        /* Construct a GUID to use in the S_USETABLE table.
@@ -660,151 +616,79 @@ void rss_save_item(rss_item *ri, rss_aggregator *RSSAggr)
        else {
                MD5Init(&md5context);
                if (ri->title != NULL) {
-                       MD5Update(&md5context,
-                                 (const unsigned char*)SKEY(ri->title));
+                       MD5Update(&md5context, (const unsigned char*)SKEY(ri->title));
                }
                if (ri->link != NULL) {
-                       MD5Update(&md5context,
-                                 (const unsigned char*)SKEY(ri->link));
+                       MD5Update(&md5context, (const unsigned char*)SKEY(ri->link));
                }
                MD5Final(rawdigest, &md5context);
-               guid = NewStrBufPlain(NULL,
-                                     MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
+               guid = NewStrBufPlain(NULL, MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/);
                StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
                StrBufAppendBufPlain(guid, HKEY("_rss2ctdl"), 0);
        }
 
        /* translate Item into message. */
-       EVRSSATOMM_syslog(LOG_DEBUG, "RSS: translating item...\n");
+       syslog(LOG_DEBUG, "RSS: translating item...");
        if (ri->description == NULL) ri->description = NewStrBufPlain(HKEY(""));
        StrBufSpaceToBlank(ri->description);
        SaveMsg->Msg.cm_magic = CTDLMESSAGE_MAGIC;
        SaveMsg->Msg.cm_anon_type = MES_NORMAL;
        SaveMsg->Msg.cm_format_type = FMT_RFC822;
 
-       if (ri->guid != NULL) {
-               SaveMsg->Msg.cm_fields['E'] = strdup(ChrPtr(ri->guid));
-       }
-
-       if (ri->author_or_creator != NULL) {
-               char *From;
-               StrBuf *Encoded = NULL;
-               int FromAt;
-
-               From = html_to_ascii(ChrPtr(ri->author_or_creator),
-                                    StrLength(ri->author_or_creator),
-                                    512, 0);
-               StrBufPlain(ri->author_or_creator, From, -1);
-               StrBufTrim(ri->author_or_creator);
-               free(From);
+       /* gather the cheaply computed information now... */
 
-               FromAt = strchr(ChrPtr(ri->author_or_creator), '@') != NULL;
-               if (!FromAt && StrLength (ri->author_email) > 0)
-               {
-                       StrBufRFC2047encode(&Encoded, ri->author_or_creator);
-                       SaveMsg->Msg.cm_fields['A'] = SmashStrBuf(&Encoded);
-                       SaveMsg->Msg.cm_fields['P'] =
-                               SmashStrBuf(&ri->author_email);
-               }
-               else
-               {
-                       if (FromAt)
-                       {
-                               SaveMsg->Msg.cm_fields['A'] =
-                                       SmashStrBuf(&ri->author_or_creator);
-                               SaveMsg->Msg.cm_fields['P'] =
-                                       strdup(SaveMsg->Msg.cm_fields['A']);
-                       }
-                       else
-                       {
-                               StrBufRFC2047encode(&Encoded,
-                                                   ri->author_or_creator);
-                               SaveMsg->Msg.cm_fields['A'] =
-                                       SmashStrBuf(&Encoded);
-                               SaveMsg->Msg.cm_fields['P'] =
-                                       strdup("rss@localhost");
-
-                       }
-                       if (ri->pubdate <= 0) {
-                               ri->pubdate = time(NULL);
-                       }
-               }
-       }
-       else {
-               SaveMsg->Msg.cm_fields['A'] = strdup("rss");
+       if (ri->guid != NULL) {
+               CM_SetField(&SaveMsg->Msg, eExclusiveID, SKEY(ri->guid));
        }
 
-       SaveMsg->Msg.cm_fields['N'] = strdup(NODENAME);
-       if (ri->title != NULL) {
-               long len;
-               char *Sbj;
-               StrBuf *Encoded, *QPEncoded;
-
-               QPEncoded = NULL;
-               StrBufSpaceToBlank(ri->title);
-               len = StrLength(ri->title);
-               Sbj = html_to_ascii(ChrPtr(ri->title), len, 512, 0);
-               len = strlen(Sbj);
-               if ((len > 0) && (Sbj[len - 1] == '\n'))
-               {
-                       len --;
-                       Sbj[len] = '\0';
-               }
-               Encoded = NewStrBufPlain(Sbj, len);
-               free(Sbj);
-
-               StrBufTrim(Encoded);
-               StrBufRFC2047encode(&QPEncoded, Encoded);
+       SaveMsg->MsgGUID = guid;
 
-               SaveMsg->Msg.cm_fields['U'] = SmashStrBuf(&QPEncoded);
-               FreeStrBuf(&Encoded);
+       if (ri->pubdate <= 0) {
+               ri->pubdate = time(NULL);
        }
-       SaveMsg->Msg.cm_fields['T'] = malloc(64);
-       snprintf(SaveMsg->Msg.cm_fields['T'], 64, "%ld", ri->pubdate);
+       CM_SetFieldLONG(&SaveMsg->Msg, eTimestamp, ri->pubdate);
        if (ri->channel_title != NULL) {
                if (StrLength(ri->channel_title) > 0) {
-                       SaveMsg->Msg.cm_fields['O'] =
-                               strdup(ChrPtr(ri->channel_title));
+                       CM_SetField(&SaveMsg->Msg, eOriginalRoom, SKEY(ri->channel_title));
                }
        }
-       if (ri->link == NULL)
-               ri->link = NewStrBufPlain(HKEY(""));
 
-#if 0 /* temporarily disable shorter urls. */
-       SaveMsg->Msg.cm_fields[TMP_SHORTER_URLS] =
-               GetShorterUrls(ri->description);
-#endif
+       /* Remember the remaining fields for deferred processing, to save computing power until we know whether we really need them. */
 
-       msglen += 1024 + StrLength(ri->link) + StrLength(ri->description) ;
+       SaveMsg->author_or_creator = ri->author_or_creator;
+       ri->author_or_creator = NULL;
 
-       Message = NewStrBufPlain(NULL, msglen);
+       SaveMsg->author_email = ri->author_email;
+       ri->author_email = NULL;
 
-       StrBufPlain(Message, HKEY(
-                           "Content-type: text/html; charset=\"UTF-8\"\r\n\r\n"
-                           "<html><body>\n"));
-#if 0 /* disable shorter url for now. */
-       SaveMsg->Msg.cm_fields[TMP_SHORTER_URL_OFFSET] = StrLength(Message);
-#endif
-       StrBufAppendBuf(Message, ri->description, 0);
-       StrBufAppendBufPlain(Message, HKEY("<br><br>\n"), 0);
+       SaveMsg->title = ri->title;
+       ri->title = NULL;
 
-       AppendLink(Message, ri->link, ri->linkTitle, NULL);
-       AppendLink(Message, ri->reLink, ri->reLinkTitle, "Reply to this");
-       StrBufAppendBufPlain(Message, HKEY("</body></html>\n"), 0);
+       SaveMsg->link = ri->link;
+       ri->link = NULL;
 
-       SaveMsg->MsgGUID = guid;
-       SaveMsg->Message = Message;
+       SaveMsg->description = ri->description;
+       ri->description = NULL;
+
+       SaveMsg->linkTitle = ri->linkTitle;
+       ri->linkTitle = NULL;
+
+       SaveMsg->reLink = ri->reLink;
+       ri->reLink = NULL;
+
+       SaveMsg->reLinkTitle = ri->reLinkTitle;
+       ri->reLinkTitle = NULL;
 
        n = GetCount(RSSAggr->Messages) + 1;
        Put(RSSAggr->Messages, IKEY(n), SaveMsg, FreeNetworkSaveMessage);
 }
 
 
+
 void rss_xml_start(void *data, const char *supplied_el, const char **attr)
 {
        rss_xml_handler *h;
        rss_aggregator  *RSSAggr = (rss_aggregator*) data;
-       AsyncIO         *IO = &RSSAggr->IO;
        rss_item        *ri = RSSAggr->Item;
        void            *pv;
        const char      *pel;
@@ -829,10 +713,11 @@ void rss_xml_start(void *data, const char *supplied_el, const char **attr)
                             pel - supplied_el - 1,
                             &v))
                {
-                       EVRSSATOM_syslog(LOG_DEBUG,
+                       syslog(LOG_DEBUG,
                                         "RSS: START ignoring "
-                                        "because of wrong namespace [%s]\n",
-                                        supplied_el);
+                                        "because of wrong namespace [%s]",
+                                        supplied_el
+                       );
                        return;
                }
        }
@@ -862,23 +747,24 @@ void rss_xml_start(void *data, const char *supplied_el, const char **attr)
                                   attr);
                }
                else
-                       EVRSSATOM_syslog(LOG_DEBUG,
-                                         "RSS: START unhandled: [%s] [%s]...\n",
+                       syslog(LOG_DEBUG,
+                                         "RSS: START unhandled: [%s] [%s]...",
                                         pel,
-                                        supplied_el);
+                                        supplied_el
+                       );
        }
        else
-               EVRSSATOM_syslog(LOG_DEBUG,
-                                "RSS: START unhandled: [%s] [%s]...\n",
+               syslog(LOG_DEBUG,
+                                "RSS: START unhandled: [%s] [%s]...",
                                 pel,
-                                supplied_el);
+                                supplied_el
+               );
 }
 
 void rss_xml_end(void *data, const char *supplied_el)
 {
        rss_xml_handler *h;
        rss_aggregator  *RSSAggr = (rss_aggregator*) data;
-       AsyncIO         *IO = &RSSAggr->IO;
        rss_item        *ri = RSSAggr->Item;
        const char      *pel;
        char            *sep = NULL;
@@ -889,7 +775,7 @@ void rss_xml_end(void *data, const char *supplied_el)
        while (sep = strchr(pel, ':'), sep) {
                pel = sep + 1;
        }
-       EVRSSATOM_syslog(LOG_DEBUG, "RSS: END %s...\n", supplied_el);
+       syslog(LOG_DEBUG, "RSS: END %s...", supplied_el);
        if (pel != supplied_el)
        {
                void *v;
@@ -899,9 +785,9 @@ void rss_xml_end(void *data, const char *supplied_el)
                             pel - supplied_el - 1,
                             &v))
                {
-                       EVRSSATOM_syslog(LOG_DEBUG,
+                       syslog(LOG_DEBUG,
                                         "RSS: END ignoring because of wrong namespace"
-                                        "[%s] = [%s]\n",
+                                        "[%s] = [%s]",
                                         supplied_el,
                                         ChrPtr(RSSAggr->CData));
                        FlushStrBuf(RSSAggr->CData);
@@ -931,15 +817,15 @@ void rss_xml_end(void *data, const char *supplied_el)
                        h->Handler(RSSAggr->CData, ri, RSSAggr, NULL);
                }
                else
-                       EVRSSATOM_syslog(LOG_DEBUG,
-                                        "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
+                       syslog(LOG_DEBUG,
+                                        "RSS: END   unhandled: [%s]  [%s] = [%s]...",
                                         pel,
                                         supplied_el,
                                         ChrPtr(RSSAggr->CData));
        }
        else
-               EVRSSATOM_syslog(LOG_DEBUG,
-                                "RSS: END   unhandled: [%s]  [%s] = [%s]...\n",
+               syslog(LOG_DEBUG,
+                                "RSS: END   unhandled: [%s]  [%s] = [%s]...",
                                 pel,
                                 supplied_el,
                                 ChrPtr(RSSAggr->CData));
@@ -948,18 +834,6 @@ void rss_xml_end(void *data, const char *supplied_el)
 
 
 
-/*
- * Callback function for passing libcurl's output to expat for parsing
- * we don't do streamed parsing so expat can handle non-utf8 documents
-size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
-{
-       XML_Parse((XML_Parser)stream, ptr, (size * nmemb), 0);
-       return (size*nmemb);
-}
- */
-
-
-
 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
 {
        StrBuf *Buf;
@@ -994,11 +868,11 @@ eNextState RSSAggregator_ParseReply(AsyncIO *IO)
        else
                ptr = "UTF-8";
 
-       EVRSSATOM_syslog(LOG_DEBUG, "RSS: Now parsing [%s] \n", ChrPtr(RSSAggr->Url));
+       syslog(LOG_DEBUG, "RSS: Now parsing [%s]", ChrPtr(RSSAggr->Url));
 
        RSSAggr->xp = XML_ParserCreateNS(ptr, ':');
        if (!RSSAggr->xp) {
-               EVRSSATOMM_syslog(LOG_ALERT, "Cannot create XML parser!\n");
+               syslog(LOG_ALERT, "Cannot create XML parser!");
                return eAbort;
        }
        FlushStrBuf(RSSAggr->Key);
@@ -1009,19 +883,18 @@ eNextState RSSAggregator_ParseReply(AsyncIO *IO)
        XML_SetUserData(RSSAggr->xp, RSSAggr);
        XML_SetCdataSectionHandler(RSSAggr->xp,
                                   rss_xml_cdata_start,
-                                  rss_xml_cdata_end);
-
+                                  rss_xml_cdata_end
+       );
 
        len = StrLength(IO->HttpReq.ReplyData);
        ptr = SmashStrBuf(&IO->HttpReq.ReplyData);
        XML_Parse(RSSAggr->xp, ptr, len, 0);
        free (ptr);
-       if (ri->done_parsing == 0)
+       if (ri->done_parsing == 0) {
                XML_Parse(RSSAggr->xp, "", 0, 1);
+       }
 
-
-       EVRSSATOM_syslog(LOG_DEBUG, "RSS: XML Status [%s] \n",
-                        XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
+       syslog(LOG_DEBUG, "RSS: XML Status [%s]", XML_ErrorString(XML_GetErrorCode(RSSAggr->xp)));
 
        XML_ParserFree(RSSAggr->xp);
        flush_rss_item(ri);
@@ -1033,15 +906,21 @@ eNextState RSSAggregator_ParseReply(AsyncIO *IO)
 
        RSSAggr->Pos = GetNewHashPos(RSSAggr->Messages, 1);
 
-//RSSAggr->next_poll = time(NULL) + config.c_net_freq;
+#if 0
+// FIXME ajc
        if (GetNextHashPos(RSSAggr->Messages,
                           RSSAggr->Pos,
                           &len,
                           &Key,
-                          (void**) &RSSAggr->ThisMsg))
+                          (void**) &RSSAggr->ThisMsg)) {
                return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry);
-       else
+       }
+       else {
+#endif
                return eAbort;
+#if 0
+       }
+#endif
 }