#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.7 2000/12/08 22:46:53 adam Exp $
+# $Id: robot.tcl,v 1.8 2000/12/10 22:27:48 adam Exp $
#
proc RobotFileNext1 {area} {
if {[catch {set ns [glob ${area}/*]}]} {
if {![string length $n]} {
puts "------------ N E X T R O U N D --------"
set robotSeq -1
- after 2000 RobotFileWait
+ after 30000 RobotFileWait
vwait robotSeq
set n [lindex $ns $robotSeq]
if {![info exists workdir]} {
return stdout
}
- puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
if {[string compare $orgPwd $workdir]} {
+ puts "ooops. RobotFileOpen failed"
puts "workdir = $workdir"
puts "pwd = $orgPwd"
exit 1
return $out
}
-proc RobotRestart {} {
+proc RobotRestart {sock} {
global URL
global robotMoreWork
-
+
+ close $sock
+ after cancel $URL($sock,cancel)
while {1} {
set url [RobotFileNext unvisited]
if {![string length $url]} {
}
set r [RobotGetUrl $url {}]
if {!$r} {
- puts "RobotGetUrl returned 0 on url=$url"
return
} else {
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
}
}
- set robotMoreWork 0
+ incr robotMoreWork -1
}
proc headSave {url out} {
incr i -1
}
}
- }
+ }
+ regsub -all {~} $path {%7E} path
+ set ok 1
+ if {[info exists URL($host,robots)]} {
+ foreach l $URL($host,robots) {
+ if {[string first [lindex $l 1] $path] == 0} {
+ set ok [lindex $l 0]
+ break
+ }
+ }
+ }
set href "$method://$host$path"
- puts "Ref href = $href"
- return 1
+ puts "Ref href = $href, ok=$ok"
+ return $ok
}
-proc Robot401 {url} {
+proc RobotError {url code} {
global URL
- puts "Bad URL $url"
+ puts "Bad URL $url, $code"
set fromurl {}
- catch {
+ if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} {
set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
set fromurl [gets $inf]
close $inf
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url 401"
+ puts $outf "URL=$url $code"
puts $outf "Reference $fromurl"
RobotFileClose $outf
}
}
-proc Robot404 {url} {
- global URL
-
- puts "Bad URL $url"
- set fromurl {}
- catch {
- set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
- set fromurl [gets $inf]
- RobotFileClose $inf
- }
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url 404"
- puts $outf "Reference $fromurl"
- RobotFileClose $outf
- }
- }
-
-proc Robot301 {url tourl} {
+proc RobotRedirect {url tourl code} {
global URL
puts "Redirecting from $url to $tourl"
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- puts $outf "URL=$url to $tourl 301"
+ puts $outf "URL=$url to $tourl $code"
puts $outf "Reference $fromurl"
RobotFileClose $outf
}
if {![RobotFileExist unvisited $host $path]} {
puts "Mark as unvisited"
set outf [RobotFileOpen unvisited $host $path]
- puts $outf 301
+ puts $outf $code
RobotFileClose $outf
}
}
puts $out "</zmbot>"
}
+proc RobotsTxt {url} {
+ global agent URL
+
+ set v URL($URL($url,host),robots)
+ set section 0
+ foreach l [split $URL($url,buf) \n] {
+ puts $l
+ if {[regexp {([-A-Za-z]+):[ \t]*([^\#]+)} $l match cmd arg]} {
+ puts "cmd=$cmd arg=$arg"
+ switch $cmd {
+ User-Agent {
+ if {$section} break
+ set pat [string tolower $arg]*
+ set section [string match $pat $agent]
+ }
+ Disallow {
+ if {$section} {
+ puts "rule [list 0 $arg]"
+ lappend $v [list 0 $arg]
+ }
+ }
+ Allow {
+ if {$section} {
+ puts "rule [list 1 $arg]"
+ lappend $v [list 1 $arg]
+ }
+ }
+ }
+ }
+ }
+}
+
proc RobotTextPlain {url out} {
global URL
puts $out $URL($url,buf)
puts $out "</documentcontent>"
puts $out "</meta>"
+
+ if {![string compare $URL($url,path) /robots.txt]} {
+ RobotsTxt $url
+ }
}
proc Robot200 {url} {
set buffer [read $sock 16384]
set readCount [string length $buffer]
-
+
if {$readCount <= 0} {
- close $sock
Robot200 $url
- RobotRestart
+ RobotRestart $sock
+ } elseif {[string first \0 $buffer] >= 0} {
+ Robot200 $url
+ RobotRestart $sock
} else {
# puts "Got $readCount bytes"
set URL($url,buf) $URL($url,buf)$buffer
proc RobotReadHeader {url sock} {
global URL
- set buffer [read $sock 2148]
+ if {[catch {set buffer [read $sock 2148]}]} {
+ RobotError $url 404
+ RobotRestart $sock
+ }
set readCount [string length $buffer]
if {$readCount <= 0} {
- Robot404 $url
- close $sock
- RobotRestart
+ RobotError $url 404
+ RobotRestart $sock
} else {
# puts "Got $readCount bytes"
set URL($url,buf) $URL($url,buf)$buffer
set URL($url,state) skip
switch $code {
301 {
- Robot301 $url $URL($url,head,location)
- close $sock
- RobotRestart
+ RobotRedirect $url $URL($url,head,location) 301
+ RobotRestart $sock
}
302 {
- Robot301 $url $URL($url,head,location)
- close $sock
- RobotRestart
+ RobotRedirect $url $URL($url,head,location) 302
+ RobotRestart $sock
}
404 {
- Robot404 $url
- close $sock
- RobotRestart
+ RobotError $url 404
+ RobotRestart $sock
}
401 {
- Robot401 $url
- close $sock
- RobotRestart
+ RobotError $url 401
+ RobotRestart $sock
}
200 {
if {![info exists URL($url,head,content-type)]} {
fileevent $sock readable [list RobotReadContent $url $sock]
}
default {
- close $sock
Robot200 $url
- RobotRestart
+ RobotRestart $sock
}
}
}
default {
- Robot404 $url
- close $sock
- RobotRestart
+ RobotError $url 404
+ RobotRestart $sock
}
}
}
}
}
+proc RobotSockCancel {sock url} {
+
+ puts "RobotSockCancel sock=$sock url=$url"
+ RobotError $url 401
+ RobotRestart $sock
+}
+
proc RobotConnect {url sock} {
global URL agent
fconfigure $sock -translation {auto crlf} -blocking 0
- puts "Reading $url"
fileevent $sock readable [list RobotReadHeader $url $sock]
puts $sock "GET $URL($url,path) HTTP/1.0"
puts $sock "Host: $URL($url,host)"
puts $sock "User-Agent: $agent"
puts $sock ""
flush $sock
+ set URL($sock,cancel) [after 60000 [list RobotSockCancel $sock $url]]
}
proc RobotNop {} {
}
}
-
set agent "zmbot/0.0"
if {![catch {set os [exec uname -s -r]}]} {
set agent "$agent ($os)"
}
proc bgerror {m} {
+ global errorInfo
puts "BGERROR $m"
+ puts $errorInfo
}
set robotMoreWork 0
}
set domains [lindex $argv 0]
-set site [lindex $argv 1]
-if {[string length $site]} {
- set robotMoreWork 1
+foreach site [lindex $argv 1] {
+ incr robotMoreWork
if [RobotGetUrl $site {}] {
- set robotMoreWork 0
+ incr robotMoreWork -1
puts "Couldn't process $site"
}
}