While hunting for an internet address bug, cleaning up more style.
[citadel.git] / citadel / contrib / getdoku.sh
1 #!/bin/bash
2
# Base URL of the wiki to mirror. Defaults to the citadel.org site; may be
# overridden from the environment (e.g. to point at a mirror or test server).
BASE_SITE=${BASE_SITE:-http://www.citadel.org}
4
5
6
# Retrieves an index document from the citadel.org website and filters it
# down to the wiki links it contains, one link per line.
#
# Globals:
#   BASE_SITE (read) - site prefix the relative URL is appended to
# Arguments:
#   1: URL of the index page, relative to BASE_SITE
#   2: outfile (relative to /tmp) where to put the filtered content at
# Side effects:
#   Changes the working directory to /tmp; the downloaded page is expected
#   to be stored by wget as /tmp/<argument 1>.
# Returns:
#   Non-zero if /tmp cannot be entered or the download fails (the outfile is
#   not written in that case); otherwise the status of the final grep, which
#   is non-zero when no link survived the filters.
GetIndex()
{
  local url_path=$1
  local outfile=$2
  local page="/tmp/${url_path}"

  # Without these checks a failed download would silently be replaced by
  # whatever stale copy of the page happens to be lying around in /tmp.
  cd /tmp/ || return 1
  if ! wget -q "${BASE_SITE}/${url_path}"; then
    echo "GetIndex: cannot retrieve ${BASE_SITE}/${url_path}" >&2
    return 1
  fi

  # Filter chain:
  #   1. keep only lines that reference a wiki page (/doku.php/...)
  #   2. drop action links (edit, history, ...) which carry a "do=" parameter
  #   3. cut the line down to the href value and rewrite the pretty URL
  #      form "doku.php/<page>" into the query form "doku.php?id=<page>"
  #   4. keep only site-relative links
  grep /doku.php/ "${page}" \
    | grep -v "do=" \
    | sed -e "s;.*href=\";;" \
          -e "s;\" .*;;" \
          -e "s;doku.php/;doku.php?id=;" \
    | grep "^/doku" > "/tmp/${outfile}"
}
22
# ---------------------------------------------------------------------------
# Main: fetch the FAQ start page, then for every sub-index linked from it
# download each listed document and glue them into one HTML file per
# sub-index, written to /tmp/<section name with ':' replaced by '_'>.html
# ---------------------------------------------------------------------------

# Start from a clean slate so pages left over from a previous run are not
# picked up again.
rm -f /tmp/mainindex /tmp/doku.php*
GetIndex "doku.php?id=faq:start" mainindex
if [[ ! -s /tmp/mainindex ]]; then
    echo "no index entries could be retrieved from ${BASE_SITE}" >&2
    exit 1
fi

# The index files are read line by line on dedicated file descriptors
# (3 and 4) so that commands inside the loops cannot consume the input.
while IFS= read -r i <&3; do
    [[ -n "$i" ]] || continue

    # Section name: everything after the last '=' of the link.
    TMPNAME=${i##*=}
    if [[ -z "$TMPNAME" ]]; then
        echo "skipping index entry without a section name: $i" >&2
        continue
    fi
    echo "$i $TMPNAME"

    SECTION_DIR="/tmp/${TMPNAME}"
    # -p: the directory may already exist from an earlier run.
    if ! mkdir -p "${SECTION_DIR}"; then
        echo "cannot create ${SECTION_DIR}; skipping $TMPNAME" >&2
        continue
    fi
    GetIndex "$i" "${TMPNAME}/${TMPNAME}"

    # The collection files are only ever appended to below; truncate them
    # first so a rerun does not duplicate every entry.
    : > "${SECTION_DIR}/collect_index"
    : > "${SECTION_DIR}/collect_bodies"

    while IFS= read -r j <&4; do
        [[ -n "$j" ]] || continue
        echo "-----------$j----------------"

        # Document name: the link with its "/doku.php?id=<namespace>:"
        # prefix removed.
        DOCUMENT_NAME=$(printf '%s\n' "$j" | sed -e "s;/doku.php?id=.*:;;")
        # An empty name, or one that still contains a '/', cannot be used
        # as a file name inside the section directory.
        if [[ -z "$DOCUMENT_NAME" || "$DOCUMENT_NAME" == */* ]]; then
            echo "skipping $j: cannot derive a document name" >&2
            continue
        fi

        # Human-readable title: taken from the first line mentioning the
        # document in the downloaded section page(s) /tmp/doku*<section>;
        # the text between the last single quote and "','/doku" is kept.
        # NOTE(review): presumably the page carries entries of the form
        # 'Title','/doku...' - confirm against the live markup.
        PLAIN_NAME=$(grep -e "$DOCUMENT_NAME" /tmp/doku*"$TMPNAME" \
            | head -n1 \
            | sed -e "s;','/doku.*;;" -e "s;.*';;")

        echo "********** retrieving $DOCUMENT_NAME ************"
        DOCUMENT_FILE="${SECTION_DIR}/${DOCUMENT_NAME}"
        # -O stores the body directly under its final name, independent of
        # the current directory and of the file name wget would derive
        # from the URL.
        if ! wget -q -O "${DOCUMENT_FILE}" "${BASE_SITE}/${j}&do=export_xhtmlbody"; then
            echo "could not retrieve $j; skipping" >&2
            rm -f -- "${DOCUMENT_FILE}"
            continue
        fi

        printf '%s\n' "<li><a href=\"#$DOCUMENT_NAME\">$PLAIN_NAME</a></li>" \
            >>"${SECTION_DIR}/collect_index"
        printf '%s\n' "<a name=\"$DOCUMENT_NAME\"></a>" \
            >>"${SECTION_DIR}/collect_bodies"
        cat -- "${DOCUMENT_FILE}" >>"${SECTION_DIR}/collect_bodies"
    done 4< "${SECTION_DIR}/${TMPNAME}"

    # Assemble the final page: link list first, then all document bodies.
    # NOTE(review): the section name is emitted directly inside <head>
    # without a <title> element; kept as-is so the generated output does
    # not change.
    {
        echo "<html><head>$TMPNAME</head><body><ul>"
        cat "${SECTION_DIR}/collect_index"
        echo "<hr></ul>"
        cat "${SECTION_DIR}/collect_bodies"
        echo "</body></html>"
    } >"/tmp/${TMPNAME//:/_}.html"
done 3< /tmp/mainindex