#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.3 1998/10/15 13:27:19 adam Exp $
+# $Id: robot.tcl,v 1.4 1999/02/04 20:37:25 perhans Exp $
#
proc RobotFileNext {area} {
if {[catch {set ns [glob ${area}/*]}]} {
# headSave --
#   Write the header part of a saved-page record to the channel $out.
#
#   url    key into the global URL array; header values are read from
#          URL($url,head,<name>)
#   out    channel the record is written to
#   title  text emitted as the record's title
#
#   Every header-derived line is optional and is written only when the
#   matching URL($url,head,...) element exists. In this revision the record
#   opens with <meta> (previously <nwi>), each value is wrapped in a
#   start/end tag pair, and the header names are looked up in lower case.
#   The opening <meta> tag is not closed in this procedure.
proc headSave {url out title} {
global URL
- puts $out {<nwi>}
- puts $out "<ti> $title"
- if {[info exists URL($url,head,Last-modified)]} {
- puts $out "<dm> $URL($url,head,Last-modified)"
+ puts $out {<meta>}
+ puts $out "<title>$title</title>"
+ if {[info exists URL($url,head,last-modified)]} {
+ puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
}
# <si> section: date, content-length and server headers, each optional.
# NOTE(review): the Server header is written inside <format> -- confirm that
# this tag mapping is intended.
puts $out {<si>}
- if {[info exists URL($url,head,Date)]} {
- puts $out " <lc> $URL($url,head,Date)"
+ if {[info exists URL($url,head,date)]} {
+ puts $out " <date>$URL($url,head,date)</date>"
}
- if {[info exists URL($url,head,Content-length)]} {
- puts $out " <by> $URL($url,head,Content-length)"
+ if {[info exists URL($url,head,content-length)]} {
+ puts $out " <by>$URL($url,head,content-length)</by>"
}
- if {[info exists URL($url,head,Server)]} {
- puts $out " <srvr> $URL($url,head,Server)"
+ if {[info exists URL($url,head,server)]} {
+ puts $out " <format>$URL($url,head,server)</format>"
}
puts $out {</si>}
# Final section (<av> before, <publisher> now): the URL itself plus the
# content-type header when present.
- puts $out {<av>}
- puts $out " <avli> $url"
- if {[info exists URL($url,head,Content-type)]} {
- puts $out " <ty> $URL($url,head,Content-type)"
+ puts $out {<publisher>}
+ puts $out " <identifier>$url</identifier>"
+ if {[info exists URL($url,head,content-type)]} {
+ puts $out " <type>$URL($url,head,content-type)</type>"
}
- puts $out {</av>}
+ puts $out {</publisher>}
}
proc RobotSave {url} {
- global URL
- global domains
+ global URL domains
set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
set ti 0
} body {
regsub -all -nocase {<script.*</script>} $body {} abody
regsub -all {<[^\>]+>} $abody {} nbody
- puts $out "<body>"
+ puts $out "<documentcontent>"
puts $out $nbody
- puts $out "</body>"
+ puts $out "</documentcontent>"
} a {
if {![info exists parm(href)]} {
puts "no href"
set path [lindex $c $i]
incr i -1
while {$i >= 0} {
- switch -- [lindex $c $i] {
- .. {
- incr i -2
- }
- . {
- incr i -1
- }
- default {
- set path [lindex $c $i]/$path
- incr i -1
- }
- }
- }
- set href "$method://$host$path"
+ switch -- [lindex $c $i] {
+ .. {
+ incr i -2
+ }
+ . {
+ incr i -1
+ }
+ default {
+ set path [lindex $c $i]/$path
+ incr i -1
+ }
+ }
+ }
+ set href "$method://$host$path"
- puts $out "<cr>"
- puts $out "<li> $href"
- puts $out "<cp> $body"
+ puts $out "<cr>"
+ puts $out "<identifier>$href</identifier>"
+ puts $out "<description>$body</description>"
puts $out "</cr>"
if {![regexp {/.*bin/} $href)]} {
headSave $url $out "untitled"
set ti 1
}
- puts $out "</nwi>"
+ puts $out "</meta>"
close $out
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
}
head {
puts "head: $line"
if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} {
- set URL($url,head,$name) $value
+ set URL($url,head,[string tolower $name]) $value
}
}
html {
lappend URL($url,line) $line
-# puts "body: $line"
}
skip {
close $sock
}
} else {
set URL($url,state) html
- if {[info exists URL($url,head,Content-type)]} {
- if {![string compare $URL($url,head,Content-type) text/html]} {
+ if {[info exists URL($url,head,content-type)]} {
+ if {![string compare $URL($url,head,content-type) text/html]} {
set URL($url,state) html
}
}
# Load the tclrobot shared library only when no htmlSwitch command exists
# yet (presumably the extension is what provides it -- confirm).
if {![llength [info commands htmlSwitch]]} {
set e [info sharedlibextension]
# Try the copy in the current directory first; if that load fails, retry
# with the bare file name. An error from the second load is not caught.
if {[catch {load ./tclrobot$e}]} {
- load tclrobot$e
+ load tclrobot$e
}
}
# At least two command-line arguments are required; otherwise print the
# usage text and exit with status 1.
if {[llength $argv] < 2} {
puts "Tclrobot: usage <domain> <start>"
+ puts " Example: '*.dk' www.indexdata.dk"
exit 1
}
# The first argument is stored in the global variable domains.
set domains [lindex $argv 0]
RobotRestart
# Enter the event loop: wait on the variable "forever".
vwait forever
-