RSS-CLIENT: remember checksum for whole feed, so we can save parsing work if we know...
author Wilfried Goesgens <dothebart@citadel.org>
Sun, 5 Aug 2012 17:13:35 +0000 (19:13 +0200)
committer Wilfried Goesgens <dothebart@citadel.org>
Sun, 5 Aug 2012 17:13:35 +0000 (19:13 +0200)
citadel/modules/rssclient/rss_atom_parser.c
citadel/modules/rssclient/serv_rssclient.c

index 86346296244eb3504d1f7b5bccaa532b9b805a50..c1f5285804341e07aaa525ed0076c271fa0d8d07 100644 (file)
@@ -946,6 +946,8 @@ void rss_xml_end(void *data, const char *supplied_el)
        FlushStrBuf(RSSAggr->CData);
 }
 
+
+
 /*
  * Callback function for passing libcurl's output to expat for parsing
  * we don't do streamed parsing so expat can handle non-utf8 documents
@@ -956,6 +958,8 @@ size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream)
 }
  */
 
+
+
 eNextState RSSAggregator_ParseReply(AsyncIO *IO)
 {
        StrBuf *Buf;
@@ -966,16 +970,6 @@ eNextState RSSAggregator_ParseReply(AsyncIO *IO)
        long len;
        const char *Key;
 
-
-       if (IO->HttpReq.httpcode != 200)
-       {
-
-               EVRSSATOM_syslog(LOG_ALERT, "need a 200, got a %ld !\n",
-                                IO->HttpReq.httpcode);
-// TODO: aide error message with rate limit
-               return eAbort;
-       }
-
        RSSAggr = IO->Data;
        ri = RSSAggr->Item;
        RSSAggr->CData = NewStrBufPlain(NULL, SIZ);
@@ -1045,7 +1039,7 @@ eNextState RSSAggregator_ParseReply(AsyncIO *IO)
                           &len,
                           &Key,
                           (void**) &RSSAggr->ThisMsg))
-               return QueueDBOperation(IO, RSS_FetchNetworkUsetableEntry);
+               return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry);
        else
                return eAbort;
 }
index 7bde9791d857f4ee029d423e7b8bd78abcf9135f..68b70a2652c5037997721c950bfe0d53783d6a03 100644 (file)
@@ -299,6 +299,68 @@ eNextState RSS_FetchNetworkUsetableEntry(AsyncIO *IO)
        }
 }
 
+/*
+ * Decide whether a freshly downloaded feed needs to be parsed at all.
+ *
+ * Returns eAbort unless the HTTP status is 200.  Otherwise an MD5 is computed
+ * over the reply body followed by the feed URL, turned into a use table key
+ * ("<hex digest>_rssFM") and looked up in CDB_USETABLE.  The record is
+ * (re)written with the current time on every call.  If the key was already
+ * present, the very same reply has been seen before and eAbort is returned;
+ * otherwise control is handed to RSSAggregator_ParseReply().
+ *
+ * When DEBUG_RSS is defined the use table is not written and the feed is
+ * always parsed.
+ */
+eNextState RSSAggregator_AnalyseReply(AsyncIO *IO)
+{
+       struct UseTable ut;
+       u_char rawdigest[MD5_DIGEST_LEN];
+       struct MD5Context md5context;
+       StrBuf *guid;
+       struct cdbdata *cdbut;
+       int AlreadySeen;
+       rss_aggregator *Ctx = (rss_aggregator *) IO->Data;
+
+       if (IO->HttpReq.httpcode != 200)
+       {
+
+               EVRSSC_syslog(LOG_ALERT, "need a 200, got a %ld !\n",
+                             IO->HttpReq.httpcode);
+// TODO: aide error message with rate limit
+               return eAbort;
+       }
+
+       /* checksum the whole reply plus the URL it was fetched from */
+       MD5Init(&md5context);
+
+       MD5Update(&md5context,
+                 (const unsigned char*)SKEY(IO->HttpReq.ReplyData));
+
+       MD5Update(&md5context,
+                 (const unsigned char*)SKEY(Ctx->Url));
+
+       MD5Final(rawdigest, &md5context);
+       guid = NewStrBufPlain(NULL,
+                             MD5_DIGEST_LEN * 2 + 12 /* _rssFM */);
+       StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN);
+       StrBufAppendBufPlain(guid, HKEY("_rssFM"), 0);
+       if (StrLength(guid) > 40)
+               StrBufCutAt(guid, 40, NULL);
+
+       /*
+        * Zero the whole record first: the memcpy below copies the key
+        * without its terminating NUL, and the complete struct is written
+        * to the database, so it must not carry stack garbage.
+        * NOTE(review): assumes ut_msgid has room for more than 40 bytes -
+        * confirm against the struct UseTable declaration.
+        */
+       memset(&ut, 0, sizeof(ut));
+       memcpy(ut.ut_msgid, SKEY(guid));
+       ut.ut_timestamp = time(NULL);
+
+       /* Find out if we've already seen this feed content */
+       cdbut = cdb_fetch(CDB_USETABLE, SKEY(guid));
+       AlreadySeen = (cdbut != NULL);
+       if (AlreadySeen)
+               cdb_free(cdbut); /* only its existence matters; release it in every build */
+
+#ifndef DEBUG_RSS
+       if (AlreadySeen) {
+               EVRSSC_syslog(LOG_DEBUG,
+                             "%s has already been seen\n",
+                             ChrPtr(Ctx->Url));
+       }
+
+       /* rewrite the record anyway, to update the timestamp */
+       cdb_store(CDB_USETABLE,
+                 SKEY(guid),
+                 &ut, sizeof(struct UseTable) );
+#endif
+       /* the key is not needed past this point; don't leak it */
+       FreeStrBuf(&guid);
+
+#ifndef DEBUG_RSS
+       if (AlreadySeen) return eAbort;
+#endif
+       return RSSAggregator_ParseReply(IO);
+}
+
+/*
+ * Callback registered with InitcURLIOStruct() in rss_do_fetching() in place
+ * of RSSAggregator_ParseReply.  It does no work itself; it only queues
+ * RSSAggregator_AnalyseReply() as a DB operation and returns whatever
+ * QueueDBOperation() returns.
+ * NOTE(review): presumably invoked once the HTTP transfer has completed, and
+ * the hand-over is needed because AnalyseReply touches the use table -
+ * confirm against the AsyncIO/event client code.
+ */
+eNextState RSSAggregator_FinishHttp(AsyncIO *IO)
+{
+       return QueueDBOperation(IO, RSSAggregator_AnalyseReply);
+}
+
 /*
  * Begin a feed parse
  */
@@ -320,7 +382,7 @@ int rss_do_fetching(rss_aggregator *RSSAggr)
        if (! InitcURLIOStruct(&RSSAggr->IO,
                               RSSAggr,
                               "Citadel RSS Client",
-                              RSSAggregator_ParseReply,
+                              RSSAggregator_FinishHttp,
                               RSSAggregator_Terminate,
                               RSSAggregator_TerminateDB,
                               RSSAggregator_ShutdownAbort))