From: Wilfried Goesgens Date: Sun, 5 Aug 2012 17:13:35 +0000 (+0200) Subject: RSS-CLIENT: remember checksum for whole feed, so we can save parsing work if we know... X-Git-Tag: v8.20~268 X-Git-Url: https://code.citadel.org/?p=citadel.git;a=commitdiff_plain;h=e296d96701baded9253071b142887af824a1db84 RSS-CLIENT: remember checksum for whole feed, so we can save parsing work if we know that content. --- diff --git a/citadel/modules/rssclient/rss_atom_parser.c b/citadel/modules/rssclient/rss_atom_parser.c index 863462962..c1f528580 100644 --- a/citadel/modules/rssclient/rss_atom_parser.c +++ b/citadel/modules/rssclient/rss_atom_parser.c @@ -946,6 +946,8 @@ void rss_xml_end(void *data, const char *supplied_el) FlushStrBuf(RSSAggr->CData); } + + /* * Callback function for passing libcurl's output to expat for parsing * we don't do streamed parsing so expat can handle non-utf8 documents @@ -956,6 +958,8 @@ size_t rss_libcurl_callback(void *ptr, size_t size, size_t nmemb, void *stream) } */ + + eNextState RSSAggregator_ParseReply(AsyncIO *IO) { StrBuf *Buf; @@ -966,16 +970,6 @@ eNextState RSSAggregator_ParseReply(AsyncIO *IO) long len; const char *Key; - - if (IO->HttpReq.httpcode != 200) - { - - EVRSSATOM_syslog(LOG_ALERT, "need a 200, got a %ld !\n", - IO->HttpReq.httpcode); -// TODO: aide error message with rate limit - return eAbort; - } - RSSAggr = IO->Data; ri = RSSAggr->Item; RSSAggr->CData = NewStrBufPlain(NULL, SIZ); @@ -1045,7 +1039,7 @@ eNextState RSSAggregator_ParseReply(AsyncIO *IO) &len, &Key, (void**) &RSSAggr->ThisMsg)) - return QueueDBOperation(IO, RSS_FetchNetworkUsetableEntry); + return NextDBOperation(IO, RSS_FetchNetworkUsetableEntry); else return eAbort; } diff --git a/citadel/modules/rssclient/serv_rssclient.c b/citadel/modules/rssclient/serv_rssclient.c index 7bde9791d..68b70a265 100644 --- a/citadel/modules/rssclient/serv_rssclient.c +++ b/citadel/modules/rssclient/serv_rssclient.c @@ -299,6 +299,68 @@ eNextState RSS_FetchNetworkUsetableEntry(AsyncIO *IO) } } +eNextState RSSAggregator_AnalyseReply(AsyncIO *IO) +{ + struct UseTable ut; + u_char rawdigest[MD5_DIGEST_LEN]; + struct MD5Context md5context; + StrBuf *guid; + struct cdbdata *cdbut; + rss_aggregator *Ctx = (rss_aggregator *) IO->Data; + + if (IO->HttpReq.httpcode != 200) + { + + EVRSSC_syslog(LOG_ALERT, "need a 200, got a %ld !\n", + IO->HttpReq.httpcode); +// TODO: aide error message with rate limit + return eAbort; + } + + MD5Init(&md5context); + + MD5Update(&md5context, + (const unsigned char*)SKEY(IO->HttpReq.ReplyData)); + + MD5Update(&md5context, + (const unsigned char*)SKEY(Ctx->Url)); + + MD5Final(rawdigest, &md5context); + guid = NewStrBufPlain(NULL, + MD5_DIGEST_LEN * 2 + 12 /* _rss2ctdl*/); + StrBufHexEscAppend(guid, NULL, rawdigest, MD5_DIGEST_LEN); + StrBufAppendBufPlain(guid, HKEY("_rssFM"), 0); + if (StrLength(guid) > 40) + StrBufCutAt(guid, 40, NULL); + /* Find out if we've already seen this item */ + memcpy(ut.ut_msgid, SKEY(guid)); + ut.ut_timestamp = time(NULL); + + cdbut = cdb_fetch(CDB_USETABLE, SKEY(guid)); +#ifndef DEBUG_RSS + if (cdbut != NULL) { + /* Item has already been seen */ + EVRSSC_syslog(LOG_DEBUG, + "%s has already been seen\n", + ChrPtr(Ctx->Url)); + cdb_free(cdbut); + } + + /* rewrite the record anyway, to update the timestamp */ + cdb_store(CDB_USETABLE, + SKEY(guid), + &ut, sizeof(struct UseTable) ); + + if (cdbut != NULL) return eAbort; +#endif + return RSSAggregator_ParseReply(IO); +} + +eNextState RSSAggregator_FinishHttp(AsyncIO *IO) +{ + return QueueDBOperation(IO, RSSAggregator_AnalyseReply); +} + /* * Begin a feed parse */ @@ -320,7 +382,7 @@ int rss_do_fetching(rss_aggregator *RSSAggr) if (! InitcURLIOStruct(&RSSAggr->IO, RSSAggr, "Citadel RSS Client", - RSSAggregator_ParseReply, + RSSAggregator_FinishHttp, RSSAggregator_Terminate, RSSAggregator_TerminateDB, RSSAggregator_ShutdownAbort))