4 * Copyright 2003-2004 Rene Puls <rpuls@gmx.net> and
5 * Oliver Feiler <kiza@kcore.de>
7 * http://kiza.kcore.de/software/snownews/
8 * http://home.kcore.de/~kianga/study/c/xmlparse.c
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License version 2 as
14 * published by the Free Software Foundation.
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
32 #include "conversions.h"
34 #include "parsedate.h"
35 #include "rdf_parsedate.h"
/* Scratch pointer: DeXML() stores each freshly allocated list node here while
 * it duplicates the feed's current item list. */
struct newsitem *copy;
/* Head of the duplicated item list built by DeXML(). parse_rdf10_item() walks
 * it to carry the read status of already known items over to the re-parsed
 * ones. */
struct newsitem *firstcopy;
41 /* During the parsens one calls, if we meet a <channel> element.
42 * The function returns a new Struct for the new feed. */
44 void parse_rdf10_channel(struct feed *feed, xmlDocPtr doc, xmlNodePtr node) {
47 /* Free everything before we write to it again. */
51 if (feed->items != NULL) {
52 while (feed->items->next_ptr != NULL) {
53 feed->items = feed->items->next_ptr;
54 free (feed->items->prev_ptr->data->title);
55 free (feed->items->prev_ptr->data->link);
56 free (feed->items->prev_ptr->data->guid);
57 free (feed->items->prev_ptr->data->description);
58 free (feed->items->prev_ptr->data);
59 free (feed->items->prev_ptr);
61 free (feed->items->data->title);
62 free (feed->items->data->link);
63 free (feed->items->data->guid);
64 free (feed->items->data->description);
65 free (feed->items->data);
69 /* At the moment we have still no Items, so set the list to null. */
73 feed->description = NULL;
75 /* Go through all <channel> tags and extract the information */
76 for (cur = node; cur != NULL; cur = cur->next) {
77 if (cur->type != XML_ELEMENT_NODE)
79 if (xmlStrcmp(cur->name, "title") == 0) {
80 feed->title = xmlNodeListGetString(doc, cur->children, 1);
81 CleanupString (feed->title, 1);
82 /* Remove trailing newline */
83 if (feed->title != NULL) {
84 if (strlen(feed->title) > 1) {
85 if (feed->title[strlen(feed->title)-1] == '\n')
86 feed->title[strlen(feed->title)-1] = '\0';
90 else if (xmlStrcmp(cur->name, "link") == 0) {
91 feed->link = xmlNodeListGetString(doc, cur->children, 1);
92 /* Remove trailing newline */
93 if (feed->link != NULL) {
94 if (strlen(feed->link) > 1) {
95 if (feed->link[strlen(feed->link)-1] == '\n')
96 feed->link[strlen(feed->link)-1] = '\0';
100 else if (xmlStrcmp(cur->name, "description") == 0) {
101 feed->description = xmlNodeListGetString(doc, cur->children, 1);
102 CleanupString (feed->description, 0);
108 void parse_rdf20_channel(struct feed *feed, xmlDocPtr doc, xmlNodePtr node)
112 /* Free everything before we write to it again. */
115 free (feed->description);
117 if (feed->items != NULL) {
118 while (feed->items->next_ptr != NULL) {
119 feed->items = feed->items->next_ptr;
120 free (feed->items->prev_ptr->data->title);
121 free (feed->items->prev_ptr->data->link);
122 free (feed->items->prev_ptr->data->guid);
123 free (feed->items->prev_ptr->data->description);
124 free (feed->items->prev_ptr->data);
125 free (feed->items->prev_ptr);
127 free (feed->items->data->title);
128 free (feed->items->data->link);
129 free (feed->items->data->guid);
130 free (feed->items->data->description);
131 free (feed->items->data);
135 /* At the moment we have still no Items, so set the list to NULL. */
139 feed->description = NULL;
141 /* Go through all tags in <channel> and extract the information. */
142 for (cur = node; cur != NULL; cur = cur->next) {
143 if (cur->type != XML_ELEMENT_NODE)
145 if (xmlStrcmp(cur->name, "title") == 0) {
146 feed->title = xmlNodeListGetString(doc, cur->children, 1);
147 CleanupString (feed->title, 1);
148 /* Remove trailing newline */
149 if (feed->title != NULL) {
150 if (strlen(feed->title) > 1) {
151 if (feed->title[strlen(feed->title)-1] == '\n')
152 feed->title[strlen(feed->title)-1] = '\0';
156 else if (xmlStrcmp(cur->name, "link") == 0) {
157 feed->link = xmlNodeListGetString(doc, cur->children, 1);
158 /* Remove trailing newline */
159 if (feed->link != NULL) {
160 if (strlen(feed->link) > 1) {
161 if (feed->link[strlen(feed->link)-1] == '\n')
162 feed->link[strlen(feed->link)-1] = '\0';
166 else if (xmlStrcmp(cur->name, "description") == 0) {
167 feed->description = xmlNodeListGetString(doc, cur->children, 1);
168 CleanupString (feed->description, 0);
169 } else if (xmlStrcmp(cur->name, "item") == 0) {
170 parse_rdf10_item(feed, doc, cur->children);
175 /* This function is called each mark, if we meet on. As parameter it needs the
176 * current new feed (new feed struct *), as well as the current XML
177 * document-acts and the current element, both comes directly of libxml.
180 void parse_rdf10_item(struct feed *feed, xmlDocPtr doc, xmlNodePtr node)
183 xmlChar *readstatusstring;
185 struct newsitem *item;
186 struct newsitem *current;
188 /* Speicher für ein neues Newsitem reservieren */
189 item = malloc(sizeof (struct newsitem));
190 item->data = malloc (sizeof (struct newsdata));
192 item->data->title = NULL;
193 item->data->link = NULL;
194 item->data->guid = NULL;
195 item->data->description = NULL;
196 item->data->date = 0L;
197 item->data->readstatus = 0;
198 item->data->parent = feed;
200 /* Alle Tags im <item> Tag durchgehen und die Informationen extrahieren.
201 Selbe Vorgehensweise wie in der parse_channel() Funktion */
202 for (cur = node; cur != NULL; cur = cur->next) {
203 if (cur->type != XML_ELEMENT_NODE)
205 if (xmlStrcmp(cur->name, "title") == 0) {
206 item->data->title = xmlNodeListGetString(doc, cur->children, 1);
207 CleanupString (item->data->title, 1);
208 /* Remove trailing newline */
209 if (item->data->title != NULL) {
210 if (strlen(item->data->title) > 1) {
211 if (item->data->title[strlen(item->data->title)-1] == '\n')
212 item->data->title[strlen(item->data->title)-1] = '\0';
216 else if (xmlStrcmp(cur->name, "link") == 0) {
217 item->data->link = xmlNodeListGetString(doc, cur->children, 1);
218 if (item->data->link == NULL) {
219 if (xmlStrcmp(cur->name, "guid") == 0)
220 item->data->link = xmlNodeListGetString(doc, cur->children, 1);
222 /* Remove trailing newline */
223 if (item->data->link != NULL) {
224 if (strlen(item->data->link) > 1) {
225 if (item->data->link[strlen(item->data->link)-1] == '\n')
226 item->data->link[strlen(item->data->link)-1] = '\0';
230 else if (xmlStrcmp(cur->name, "guid") == 0) {
231 item->data->guid = xmlNodeListGetString(doc, cur->children, 1);
232 if (item->data->guid == NULL) {
233 if (xmlStrcmp(cur->name, "guid") == 0)
234 item->data->guid = xmlNodeListGetString(doc, cur->children, 1);
236 /* Remove trailing newline */
237 if (item->data->guid != NULL) {
238 if (strlen(item->data->guid) > 1) {
239 if (item->data->guid[strlen(item->data->guid)-1] == '\n')
240 item->data->guid[strlen(item->data->guid)-1] = '\0';
244 else if (xmlStrcmp(cur->name, "description") == 0) {
245 item->data->description = xmlNodeListGetString(doc, cur->children, 1);
246 CleanupString (item->data->description, 0);
248 /* pubDate will be in the form of: Thu, 15 Sep 2005 14:32:44 +0000 */
249 else if (xmlStrcmp(cur->name, "pubDate") == 0) {
250 item->data->date = parsedate(xmlNodeListGetString(doc, cur->children, 1));
252 /* RSS style date will be in the form of: 2005-09-17T06:18:00+00:00
253 * Only use it if no pubDate was already found.
255 else if (xmlStrcmp(cur->name, "date") == 0) {
256 if (item->data->date <= 0L) {
257 item->data->date = rdf_parsedate(xmlNodeListGetString(doc, cur->children, 1));
260 else if (xmlStrcmp(cur->name, "readstatus") == 0) {
261 /* Will cause memory leak otherwise, xmlNodeListGetString must be freed. */
262 readstatusstring = xmlNodeListGetString(doc, cur->children, 1);
263 item->data->readstatus = atoi (readstatusstring);
264 xmlFree (readstatusstring);
268 /* If saverestore == 1, restore readstatus. */
269 if (saverestore == 1) {
270 for (current = firstcopy; current != NULL; current = current->next_ptr) {
271 if ((current->data->link != NULL) && (item->data->link != NULL)) {
272 if ((current->data->title != NULL) && (item->data->title != NULL)) {
273 if ((strcmp(item->data->link, current->data->link) == 0) &&
274 (strcmp(item->data->title, current->data->title) == 0))
275 item->data->readstatus = current->data->readstatus;
277 if (strcmp(item->data->link, current->data->link) == 0)
278 item->data->readstatus = current->data->readstatus;
284 item->next_ptr = NULL;
285 if (feed->items == NULL) {
286 item->prev_ptr = NULL;
289 item->prev_ptr = feed->items;
290 while (item->prev_ptr->next_ptr != NULL)
291 item->prev_ptr = item->prev_ptr->next_ptr;
292 item->prev_ptr->next_ptr = item;
299 int DeXML (struct feed *cur_ptr) {
302 struct newsitem *cur_item;
304 if (cur_ptr->feed == NULL)
308 /* Wenn cur_ptr->items != NULL dann können wir uns item->readstatus
309 zwischenspeichern. */
310 if (cur_ptr->items != NULL) {
315 /* Copy current newsitem struct. */
316 for (cur_item = cur_ptr->items; cur_item != NULL; cur_item = cur_item->next_ptr) {
317 copy = malloc (sizeof(struct newsitem));
318 copy->data = malloc (sizeof (struct newsdata));
319 copy->data->title = NULL;
320 copy->data->link = NULL;
321 copy->data->guid = NULL;
322 copy->data->description = NULL;
323 copy->data->date = 0L;
324 copy->data->readstatus = cur_item->data->readstatus;
325 if (cur_item->data->link != NULL)
326 copy->data->link = strdup (cur_item->data->link);
327 if (cur_item->data->title != NULL)
328 copy->data->title = strdup (cur_item->data->title);
330 copy->next_ptr = NULL;
331 if (firstcopy == NULL) {
332 copy->prev_ptr = NULL;
335 copy->prev_ptr = firstcopy;
336 while (copy->prev_ptr->next_ptr != NULL)
337 copy->prev_ptr = copy->prev_ptr->next_ptr;
338 copy->prev_ptr->next_ptr = copy;
344 parse an XML in-memory document and build a tree.
345 In case the document is not Well Formed, a tree is built anyway. */
346 doc = xmlRecoverMemory(cur_ptr->feed, strlen(cur_ptr->feed));
351 /* Das Root-Element finden (in unserem Fall sollte es "<RDF:RDF>" heißen.
352 Dabei wird das RDF: Prefix fürs Erste ignoriert, bis der Jaguar
353 herausfindet, wie man das genau auslesen kann (jau). */
354 cur = xmlDocGetRootElement(doc);
361 /* Überprüfen, ob das Element auch wirklich <RDF> heißt */
362 if (xmlStrcmp(cur->name, "RDF") == 0) {
364 /* Jetzt gehen wir alle Elemente im Dokument durch. Diese Schleife
365 selbst läuft jedoch nur durch die Elemente auf höchster Ebene
366 (bei HTML wären das nur HEAD und BODY), wandert also nicht die
367 gesamte Struktur nach unten durch. Dafür sind die Funktionen zuständig,
368 die wir dann in der Schleife selbst aufrufen. */
369 for (cur = cur->children; cur != NULL; cur = cur->next) {
370 if (cur->type != XML_ELEMENT_NODE)
372 if (xmlStrcmp(cur->name, "channel") == 0)
373 parse_rdf10_channel(cur_ptr, doc, cur->children);
374 if (xmlStrcmp(cur->name, "item") == 0)
375 parse_rdf10_item(cur_ptr, doc, cur->children);
376 /* Last-Modified is only used when reading from internal feeds (disk cache). */
377 if (xmlStrcmp(cur->name, "lastmodified") == 0)
378 cur_ptr->lastmodified = xmlNodeListGetString(doc, cur->children, 1);
380 } else if (xmlStrcmp(cur->name, "rss") == 0) {
381 for (cur = cur->children; cur != NULL; cur = cur->next) {
382 if (cur->type != XML_ELEMENT_NODE)
384 if (xmlStrcmp(cur->name, "channel") == 0)
385 parse_rdf20_channel(cur_ptr, doc, cur->children);
394 if (saverestore == 1) {
395 /* free struct newsitem *copy. */
396 while (firstcopy->next_ptr != NULL) {
397 firstcopy = firstcopy->next_ptr;
398 free (firstcopy->prev_ptr->data->link);
399 free (firstcopy->prev_ptr->data->guid);
400 free (firstcopy->prev_ptr->data->title);
401 free (firstcopy->prev_ptr->data);
402 free (firstcopy->prev_ptr);
404 free (firstcopy->data->link);
405 free (firstcopy->data->guid);
406 free (firstcopy->data->title);
407 free (firstcopy->data);
411 if (cur_ptr->original != NULL)
412 free (cur_ptr->original);
414 /* Set -> title to something if it's a NULL pointer to avoid crash with strdup below. */
415 if (cur_ptr->title == NULL)
416 cur_ptr->title = strdup (cur_ptr->feedurl);
417 cur_ptr->original = strdup (cur_ptr->title);