#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.27 2001/11/09 13:26:50 adam Exp $
+# $Id: robot.tcl,v 1.28 2001/11/13 11:17:26 adam Exp $
#
# RobotFileNext1 area lead
# NOTE(review): in this excerpt the body shows only a disabled debug trace of
# the two arguments; the working statements are presumably elided here --
# confirm against the full robot.tcl before relying on this description.
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
}
proc RobotFileNext {area} {
- global robotSeq global idletime ns
+ global robotSeq
+ global idletime ns
+ global status
# puts "RobotFileNext robotSeq=$robotSeq"
if {$robotSeq < 0} {
if {![string length $n]} {
set robotSeq -1
flush stdout
- puts "Round robin"
+ puts "Round robin un,ba,vi=$status(unvisited),$status(bad),$status(visited)"
return wait
}
incr robotSeq
}
proc RobotFileUnlink {area host path} {
+ global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
set lpath [split $path /]
set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
# puts "npath=$npath"
set comp [split $npath /]
+ if {[catch {exec rm [join $comp /]}]} return
+
set l [llength $comp]
incr l -1
- if {[catch {exec rm [join $comp /]}]} return
incr l -1
+ incr status($area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
global workdir
+ global status
if {![info exists workdir]} {
return stdout
set out [open frobots.txt w]
puts "creating robots.txt in $d"
close $out
+ incr status(unvisited)
}
}
}
} else {
set out [open f $mode]
}
+ if {$mode == "w"} {
+ incr status($area)
+ }
cd $orgPwd
return $out
}
text/plain {
RobotTextPlain $url $out
}
- application/pdf {
- set pdff [open test.pdf w]
- puts -nonewline $pdff $URL($url,buf)
- close $pdff
- }
}
puts $out "</zmbot>"
}
puts -nonewline $out $URL($url,buf)
RobotFileClose $out
- if {![checkrule mime $URL($url,head,content-type)]} {
- RobotError $url mimedeny
- return
- }
set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
RobotWriteMetadata $url $out
RobotFileClose $out
if {[catch {set buffer [read $sock 2148]}]} {
RobotError $url 404
RobotRestart $url $sock
+ return
}
set readCount [string length $buffer]
if {![info exists URL($url,head,content-type)]} {
set URL($url,head,content-type) {}
}
- set binary 0
- switch $URL($url,head,content-type) {
- application/pdf {
- set binary 1
+ set binary 1
+ switch -glob -- $URL($url,head,content-type) {
+ text/* {
+ set binary 0
}
}
+ if {![regexp {/robots.txt$} $url]} {
+ if {![checkrule mime $URL($url,head,content-type)]} {
+ RobotError $url mimedeny
+ RobotRestart $url $sock
+ return
+ }
+ }
fileevent $sock readable [list RobotReadContent $url $sock $binary]
}
default {
set idletime 60000
set acceptLanguage {}
set debuglevel 0
# Global per-area counters, all starting at zero.  RobotFileOpen increments
# status($area) when it opens a file in mode "w", and RobotFileUnlink
# decrements status($area) after its rm succeeds.  The unvisited, bad and
# visited counts are printed in the "Round robin" and "End" messages.
# NOTE(review): status(raw) is initialised here but not referenced in the
# visible code -- presumably "raw" is passed as an area name; confirm.
+set status(unvisited) 0
+set status(visited) 0
+set status(bad) 0
+set status(raw) 0
# Rules: allow, deny, url
puts "max distance=$maxdistance"
puts "max jobs=$robotsMax"
+
# Kick off the run (RobotStart is defined elsewhere in the file), then
# service the Tcl event loop: vwait returns each time robotsRunning is
# written, and the loop exits once its value is false (zero).
RobotStart
while {$robotsRunning} {
vwait robotsRunning
}
+
# Final summary of the unvisited, bad and visited counters.
+puts "End un,ba,vi=$status(unvisited),$status(bad),$status(visited)"