2 # $Id: robot.tcl,v 1.2 1998/10/15 12:31:03 adam Exp $
4 proc RobotFileNext {area} {
5 if {[catch {set ns [glob ${area}/*]}]} {
8 set off [string first / $area]
11 if {[file isfile $n]} {
12 if {[string first :.html $n] > 0} {
13 return http://[string range $area/ $off end]
15 return http://[string range $n $off end]
17 if {[file isdirectory $n]} {
18 set sb [RobotFileNext $n]
19 if {[string length $sb]} {
27 proc RobotFileExist {area host path} {
28 set comp [split $area/$host$path /]
31 if {![string length [lindex $comp $l]]} {
32 set comp [split $area/$host$path:.html /]
34 return [file exists [join $comp /]]
37 proc RobotFileUnlink {area host path} {
38 set comp [split $area/$host$path /]
41 if {![string length [lindex $comp $l]]} {
42 set comp [split $area/$host$path:.html /]
44 if {[catch {exec rm [join $comp /]}]} return
46 for {set i $l} {$i > 0} {incr i -1} {
47 set path [join [lrange $comp 0 $i] /]
48 if {![catch {glob $path/*}]} return
53 proc RobotFileOpen {area host path} {
56 set comp [split $area/$host$path /]
57 set len [llength $comp]
59 for {set i 0} {$i < $len} {incr i} {
60 set d [lindex $comp $i]
61 if {[catch {cd ./$d}]} {
66 set d [lindex $comp $len]
67 if {[string length $d]} {
70 set out [open :.html w]
76 proc RobotRestart {} {
80 set url [RobotFileNext unvisited]
81 if {![string length $url]} break
82 set r [RobotGetUrl $url {}]
86 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
92 proc headSave {url out title} {
96 puts $out "<ti> $title"
97 if {[info exists URL($url,head,Last-modified)]} {
98 puts $out "<dm> $URL($url,head,Last-modified)"
101 if {[info exists URL($url,head,Date)]} {
102 puts $out " <lc> $URL($url,head,Date)"
104 if {[info exists URL($url,head,Content-length)]} {
105 puts $out " <by> $URL($url,head,Content-length)"
107 if {[info exists URL($url,head,Server)]} {
108 puts $out " <srvr> $URL($url,head,Server)"
112 puts $out " <avli> $url"
113 if {[info exists URL($url,head,Content-type)]} {
114 puts $out " <ty> $URL($url,head,Content-type)"
119 proc RobotSave {url} {
122 set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
124 if {[info exists URL($url,line)]} {
125 set htmlContent [join $URL($url,line) \n]
127 htmlSwitch $htmlContent \
130 headSave $url $out $body
134 regsub -all -nocase {<script.*</script>} $body {} abody
135 regsub -all {<[^\>]+>} $abody {} nbody
140 if {![info exists parm(href)]} {
145 headSave $url $out "untitled"
149 if {[regexp {^\#} $parm(href)]} {
151 } elseif {[regexp {^([^:]+):([^#]+)} $parm(href) x method hpath]} {
152 if {![string compare $method http]} {
153 if {![regexp {^//([^/]+)(.*)} $hpath x host path]} {
154 set host $URL($url,host)
157 if {![regexp {\.indexdata\.dk$} $host]} continue
161 } elseif {[regexp {^([/~][^#]*)} $parm(href) x path]} {
162 set host $URL($url,host)
165 set ext [file extension $URL($url,path)]
166 if {[string compare $ext {}]} {
167 set dpart [file dirname $URL($url,path)]
169 set dpart $URL($url,path)
171 regexp {^([^#]+)} $parm(href) x path
172 set host $URL($url,host)
173 set path [string trimright $dpart /]/$path
176 set ext [file extension $path]
177 if {![string length $ext]} {
178 set path [string trimright $path /]/
180 set path [string trimright $path /]
182 set c [split $path /]
185 set path [lindex $c $i]
188 switch -- [lindex $c $i] {
196 set path [lindex $c $i]/$path
201 set href "$method://$host$path"
204 puts $out "<li> $href"
205 puts $out "<cp> $body"
208 if {![regexp {/.*bin/} $href)]} {
209 if {![RobotFileExist visited $host $path]} {
210 set outf [RobotFileOpen unvisited $host $path]
217 headSave $url $out "untitled"
222 RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
225 proc RobotRead {url sock} {
228 set readCount [gets $sock line]
229 if {$readCount < 0} {
235 } elseif {$readCount > 0} {
236 switch $URL($url,state) {
239 if {[regexp {([^:]+):[ ]+(.*)} $line x name value]} {
240 set URL($url,head,$name) $value
244 lappend URL($url,line) $line
254 set URL($url,state) html
255 if {[info exists URL($url,head,Content-type)]} {
256 if {![string compare $URL($url,head,Content-type) text/html]} {
257 set URL($url,state) html
263 proc RobotConnect {url sock} {
266 fileevent $sock readable [list RobotRead $url $sock]
267 puts $sock "GET $URL($url,path) HTTP/1.0"
276 proc RobotGetUrl {url phost} {
281 if {[regexp {([^:]+)://([^/]+)([^ ?]*)} $url x method host path]} {
282 puts "method=$method host=$host path=$path"
286 set URL($url,method) $method
287 set URL($url,host) $host
288 set URL($url,port) $port
289 set URL($url,path) $path
290 set URL($url,state) head
291 if [catch {set sock [socket -async $host $port]}] {
294 fconfigure $sock -translation {auto crlf}
295 RobotConnect $url $sock
300 if {![llength [info commands htmlSwitch]]} {
301 set e [info sharedlibextension]
302 if {[catch {load ./tclrobot$e}]} {
307 if {![llength $argv]} {
308 puts "Tclrobot: specify one or more sites."
312 set x [RobotFileOpen unvisited $site /]