#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.34 2002/06/18 19:57:53 adam Exp $
+# $Id: robot.tcl,v 1.35 2003/06/10 11:43:52 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
set fromurl [string trim [gets $inf]]
}
-proc RobotFileNext {area} {
+proc RobotFileNext {task area} {
global robotSeq
global idletime ns
global status
- # puts "RobotFileNext robotSeq=$robotSeq"
- if {$robotSeq < 0} {
+ # puts "RobotFileNext robotSeq=$robotSeq($task)"
+ if {$robotSeq($task) < 0} {
return {}
}
- if {$robotSeq == 0} {
- if {[catch {set ns [glob ${area}/*]}]} {
- return {}
+ if {$robotSeq($task) == 0} {
+ if {[catch {set ns($task) [glob $task/$area/*]}]} {
+ return done
}
}
- set off [string length $area]
+ # puts "ns=$ns($task)"
+ set off [string length $task/$area]
incr off
- set n [lindex $ns $robotSeq]
+ set n [lindex $ns($task) $robotSeq($task)]
+ # puts "n=$n"
if {![string length $n]} {
- set robotSeq -1
+ set robotSeq($task) -1
flush stdout
- set statusfile [open status w]
- puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
+ set statusfile [open $task/status w]
+ puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
close $statusfile
return wait
}
- incr robotSeq
+ incr robotSeq($task)
if {[file isfile $n/frobots.txt]} {
puts "ok returning http://[string range $n $off end]/robots.txt"
return http://[string range $n $off end]/robots.txt
}
}
puts "no more work at end of RobotFileNext n=$n"
- puts "ns=$ns"
+ puts "ns=$ns($task)"
return {}
}
-proc RobotFileExist {area host path} {
+proc RobotFileExist {task area host path} {
global debuglevel
if {$debuglevel > 3} {
incr l -1
set t [lindex $lpath $l]
incr l -1
- set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
if {$debuglevel > 3} {
puts "RobotFileExist end npath=$npath"
}
return [file exists $npath]
}
-proc RobotFileUnlink {area host path} {
+proc RobotFileUnlink {task area host path} {
global status
# puts "RobotFileUnlink begin"
# puts "area=$area host=$host path=$path"
incr l -1
set t [lindex $lpath $l]
incr l -1
- set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ set npath $task/$area/$host[join [lrange $lpath 0 $l] /d]/f$t
# puts "npath=$npath"
set comp [split $npath /]
if {[catch {exec rm [join $comp /]}]} return
set l [llength $comp]
incr l -1
incr l -1
- incr status($area) -1
+ incr status($task,$area) -1
for {set i $l} {$i > 0} {incr i -1} {
set path [join [lrange $comp 0 $i] /]
if {![catch {glob $path/*}]} return
}
}
-proc RobotFileOpen {area host path {mode w}} {
+proc RobotFileOpen {task area host path {mode w}} {
set orgPwd [pwd]
global workdir
global status
puts "pwd = $orgPwd"
exit 1
}
- set comp [split $area/$host$path /]
+
+ set comp [split $task/$area/$host /]
set len [llength $comp]
incr len -1
- for {set i 0} {$i < $len} {incr i} {
- if {$i > 1} {
- set d "d[lindex $comp $i]"
- } else {
- set d [lindex $comp $i]
- }
- if {[catch {cd ./$d}]} {
+
+ # puts "1 comp=$comp"
+
+ for {set i 0} {$i <= $len} {incr i} {
+ set d [lindex $comp $i]
+ if {[catch {cd $d}]} {
exec mkdir $d
cd ./$d
- if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
+ if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
if {[string compare $path /robots.txt]} {
set out [open frobots.txt w]
puts "creating robots.txt in $d"
close $out
- incr status(unvisited)
+ incr status($task,unvisited)
}
}
}
}
+
+ set comp [split $path /]
+ set len [llength $comp]
+ incr len -1
+
+ # puts "2 path=$path comp=$comp"
+
+ for {set i 0} {$i < $len} {incr i} {
+ set d "d[lindex $comp $i]"
+ if {[string length $d] > 1} {
+ if {[catch {cd $d}]} {
+ exec mkdir $d
+ cd ./$d
+ }
+ }
+ }
set d [lindex $comp $len]
if {[string length $d]} {
set out [open f$d $mode]
set out [open f $mode]
}
if {$mode == "w"} {
- incr status($area)
+ incr status($task,$area)
}
cd $orgPwd
return $out
}
-proc RobotRR {} {
- global robotSeq robotsRunning
+# RobotStartJob fname t
+# Read the job description file `fname`, and unless its <status> element
+# says "done", register `t` as a task and apply the settings found in the
+# file. A job whose status is "pending" is rewritten with status "running".
+# fname - path of the job file to read (and possibly rewrite)
+# t - task name to register via `task`
+# NOTE(review): if the file has no <status> element the regexp leaves
+# `status` unset and the following test raises an error.
+proc RobotStartJob {fname t} {
+ global control
+
+
+ set f [open $fname r]
+ set xml [read $f]
+ puts "Reading $fname"
+ # Capture the text of the first <status> element into `status`.
+ regexp {<status>([^<]*)</status>} $xml x status
+ close $f
+ if {$status == "done"} {
+ puts "already done"
+ return
+ }
+ puts "status = $status"
+ # `task` returns 0 when t is already in the task list; nothing more to do.
+ if {![task $t]} {
+ return
+ }
+ # Dispatch on the elements of the job file. htmlSwitch is defined
+ # elsewhere; presumably it sets `body` and `parm` for each handler
+ # script - TODO confirm. The filter handler invokes the command named
+ # by the element's action attribute (presumably allow or deny).
+ htmlSwitch $xml \
+ url {
+ url $body
+ } filter {
+ set type $parm(type)
+ set action $parm(action)
+ if {$type == "domain"} {
+ $action url http://$body/*
+ }
+ if {$type == "url"} {
+ $action url $body
+ }
+ if {$type == "mime"} {
+ $action mime $body
+ }
+ } distance {
+ set control($t,distance) $body
+ } status {
+ set control($t,filestatus) $body
+ }
+ # Mark a pending job as running by rewriting the whole file.
+ if {$status == "pending"} {
+ regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
+ set f [open $fname w]
+ puts -nonewline $f $xml2
+ close $f
+ }
+}
+
+# RobotDoneJob t
+# Mark the job file of task `t` as done: read `$t.tkl`, print its current
+# status, and rewrite the file with the first <status> element replaced by
+# <status>done</status>. Does nothing unless the global daemon_dir is set.
+# NOTE(review): if the file has no <status> element, `status` stays unset
+# and the status printout raises an error before the file is closed.
+proc RobotDoneJob {t} {
+ global daemon_dir
+
+ if {![info exists daemon_dir]} {
+ return
+ }
+
+ # The job file name is the task name plus the .tkl extension.
+ set fname $t.tkl
+
+ set f [open $fname r]
+ set xml [read $f]
+ puts "Reading $fname"
+ regexp {<status>([^<]*)</status>} $xml x status
+ puts "------"
+ puts "status = $status"
+ close $f
+
+ # Rewrite the whole file with the status element replaced.
+ regsub {<status>[^<]*</status>} $xml {<status>done</status>} xml2
+ set f [open $fname w]
+ puts -nonewline $f $xml2
+ close $f
+}
+
+# RobotScanDir
+# Scan every directory listed in the global daemon_dir for job files
+# (*.tkl) and pass each readable regular file to RobotStartJob, using the
+# file name without its extension as the task name.
+# Does nothing when daemon_dir is not set.
+proc RobotScanDir {} {
+    global daemon_dir
+
+    if {![info exists daemon_dir]} {
+        return
+    }
+    foreach d $daemon_dir {
+        # glob raises an error when nothing matches. Skip only this
+        # directory so the remaining ones are still scanned.
+        if {[catch {set files [glob $d/*.tkl]}]} {
+            continue
+        }
+        foreach fname $files {
+            if {[file isfile $fname] && [file readable $fname]} {
+                set t [file rootname $fname]
+                RobotStartJob $fname $t
+            }
+        }
+    }
+}
+
+proc RobotRR {task} {
+ global robotSeq robotsRunning tasks robotsMax status
+
+ puts "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
incr robotsRunning -1
+
+ # only one task gets through...
+ if {[string compare [lindex $tasks 0] $task]} {
+ return
+ }
+ puts "RobotRR. task = $task"
while {$robotsRunning} {
vwait robotsRunning
}
- set robotSeq 0
- RobotStart
+ puts "Scan"
+ if {[catch {RobotScanDir} msg]} {
+ puts "RobotScanDir failed"
+ puts $msg
+ }
+ foreach t $tasks {
+ set statusfile [open $t/status w]
+ puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+ close $statusfile
+ set robotSeq($t) 0
+ RobotStart $t
+ }
+}
+
+# RobotDaemonSig
+# Timer callback scheduled by RobotDaemonLoop: bumps the global daemon_cnt,
+# which releases the `vwait daemon_cnt` in that loop.
+proc RobotDaemonSig {} {
+ global daemon_cnt
+
+ incr daemon_cnt
+}
+
+# RobotDaemonLoop
+# Daemon main loop; never returns. Each pass prints the pass counter,
+# rescans the job directories, restarts every known task from sequence 0,
+# waits until no robots are running, then waits for the 30 second timer
+# (RobotDaemonSig) before the next pass.
+proc RobotDaemonLoop {} {
+    # robotSeq must be declared global here: without it the reset below
+    # would write to a local array and leave the global sequence untouched.
+    global daemon_cnt tasks robotsRunning status robotSeq
+
+    set daemon_cnt 0
+    while 1 {
+        puts $daemon_cnt
+
+        RobotScanDir
+
+        if {[info exists tasks]} {
+            puts "daemon loop tasks $tasks"
+            foreach t $tasks {
+                set robotSeq($t) 0
+                RobotStart $t
+            }
+            # Serve events until every running robot has finished.
+            while {$robotsRunning} {
+                vwait robotsRunning
+            }
+        }
+        # Sleep: the timer bumps daemon_cnt, which releases the vwait.
+        after 30000 RobotDaemonSig
+        vwait daemon_cnt
+    }
}
-proc RobotRestart {url sock} {
+proc RobotRestart {task url sock} {
global URL robotsRunning
close $sock
after cancel $URL($sock,cancel)
- foreach v [array names URL $url,*] {
+ foreach v [array names URL $task,$url,*] {
unset URL($v)
}
incr robotsRunning -1
- RobotStart
+ RobotStart $task
}
-proc RobotStart {} {
+proc RobotStart {task} {
global URL
- global robotsRunning robotsMax idletime
+ global robotsRunning robotsMax idletime status tasks
- # puts "RobotStart"
+ # puts "RobotStart $task running=$robotsRunning"
while {1} {
- set url [RobotFileNext unvisited]
+ set url [RobotFileNext $task unvisited]
+ if {[string compare $url done] == 0} {
+ puts "In RobotStart task $task done"
+
+ catch {unset ntasks}
+ foreach t $tasks {
+ if {[string compare $t $task]} {
+ lappend ntasks $t
+ } else {
+ puts "task $t done"
+ }
+ }
+ if {![info exists ntasks]} {
+ unset tasks
+ puts "all done"
+ } else {
+ set tasks $ntasks
+ }
+ RobotDoneJob $task
+ return
+ }
if {![string length $url]} {
return
}
- incr robotsRunning
+ incr robotsRunning
if {[string compare $url wait] == 0} {
- after $idletime RobotRR
- return
+ after $idletime [list RobotRR $task]
+ return
}
- set r [RobotGetUrl $url {}]
+ set r [RobotGetUrl $task $url {}]
if {!$r} {
if {$robotsRunning >= $robotsMax} return
} else {
incr robotsRunning -1
- if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
RobotFileClose $outf
}
- RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
}
}
}
-proc headSave {url out} {
+proc headSave {task url out} {
global URL
- if {[info exists URL($url,head,last-modified)]} {
- puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
+ if {[info exists URL($task,$url,head,last-modified)]} {
+ puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
}
puts $out {<si>}
- if {[info exists URL($url,head,date)]} {
- puts $out " <date>$URL($url,head,date)</date>"
+ if {[info exists URL($task,$url,head,date)]} {
+ puts $out " <date>$URL($task,$url,head,date)</date>"
}
- if {[info exists URL($url,head,content-length)]} {
- puts $out " <by>$URL($url,head,content-length)</by>"
+ if {[info exists URL($task,$url,head,content-length)]} {
+ puts $out " <by>$URL($task,$url,head,content-length)</by>"
}
- if {[info exists URL($url,head,server)]} {
- puts $out " <format>$URL($url,head,server)</format>"
+ if {[info exists URL($task,$url,head,server)]} {
+ puts $out " <format>$URL($task,$url,head,server)</format>"
}
puts $out {</si>}
puts $out {<publisher>}
puts $out " <identifier>$url</identifier>"
- if {[info exists URL($url,head,content-type)]} {
- puts $out " <type>$URL($url,head,content-type)</type>"
+ if {[info exists URL($task,$url,head,content-type)]} {
+ puts $out " <type>$URL($task,$url,head,content-type)</type>"
}
puts $out {</publisher>}
}
-proc RobotHref {url hrefx hostx pathx} {
- global URL domains debuglevel
+proc RobotHref {task url hrefx hostx pathx} {
+ global URL control debuglevel
upvar $hrefx href
upvar $hostx host
upvar $pathx path
if {![string length $surl]} {
set surl /
}
- if {[info exist domains]} {
+ if {[info exist control($task,domains)]} {
set ok 0
- foreach domain $domains {
+ foreach domain $control($task,domains) {
if {[string match $domain $host]} {
set ok 1
break
- }
+ }
}
if {!$ok} {
return 0
}
} else {
regexp {^([^\#]*)} $hpath x surl
- set host $URL($url,hostport)
+ set host $URL($task,$url,hostport)
}
if {![string length $surl]} {
return 0
}
if {[string first / $surl]} {
# relative path
- set curpath $URL($url,path)
- if {[info exists URL($url,bpath)]} {
- set curpath $URL($url,bpath)
+ set curpath $URL($task,$url,path)
+ if {[info exists URL($task,$url,bpath)]} {
+ set curpath $URL($task,$url,bpath)
}
regexp {^([^\#?]*)} $curpath x dpart
set l [string last / $dpart]
if {$debuglevel > 1} {
puts "Ref result = $href"
}
- return [checkrule url $href]
+ return [checkrule $task url $href]
}
-proc RobotError {url code} {
+proc RobotError {task url code} {
global URL
puts "Bad URL $url (code $code)"
set fromurl {}
set distance -1
- if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
- set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
- RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
- if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
}
-proc RobotRedirect {url tourl code} {
+proc RobotRedirect {task url tourl code} {
global URL
puts "Redirecting from $url to $tourl"
set distance {}
set fromurl {}
- if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
- set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
- if {![RobotFileExist bad $URL($url,hostport) $URL($url,path)]} {
- set outf [RobotFileOpen bad $URL($url,hostport) $URL($url,path)]
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
- if {[RobotHref $url tourl host path]} {
- if {![RobotFileExist visited $host $path]} {
- if {![RobotFileExist unvisited $host $path]} {
- set outf [RobotFileOpen unvisited $host $path]
+ if {[RobotHref $task $url tourl host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ if {![RobotFileExist $task unvisited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf $fromurl $distance
RobotFileClose $outf
}
} else {
set olddistance {}
- set inf [RobotFileOpen visited $host $path r]
+ set inf [RobotFileOpen $task visited $host $path r]
RobotReadRecord $inf oldurl olddistance
RobotFileClose $inf
if {[string length $olddistance] == 0} {
}
puts "distance=$distance olddistance=$olddistance"
if {[expr $distance < $olddistance]} {
- set outf [RobotFileOpen unvisited $host $path]
+ set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf $tourl $distance
RobotFileClose $outf
}
}
}
- if {[catch {RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)}]} {
+ if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
puts "unlink failed"
exit 1
}
}
-proc link {url out href body distance} {
- global URL maxdistance
- if {[expr $distance > $maxdistance]} return
+proc link {task url out href body distance} {
+ global URL control
+ if {[expr $distance > $control($task,distance)]} return
- if {![RobotHref $url href host path]} return
+ if {![RobotHref $task $url href host path]} return
puts $out "<cr>"
puts $out "<identifier>$href</identifier>"
puts $out "<description>$body</description>"
puts $out "</cr>"
- if {![RobotFileExist visited $host $path]} {
+ if {![RobotFileExist $task visited $host $path]} {
set olddistance 1000
- if {![RobotFileExist bad $host $path]} {
- if {[RobotFileExist unvisited $host $path]} {
- set inf [RobotFileOpen unvisited $host $path r]
+ if {![RobotFileExist $task bad $host $path]} {
+ if {[RobotFileExist $task unvisited $host $path]} {
+ set inf [RobotFileOpen $task unvisited $host $path r]
RobotReadRecord $inf oldurl olddistance
RobotFileClose $inf
}
set olddistance 1000
}
if {[expr $distance < $olddistance]} {
- set outf [RobotFileOpen unvisited $host $path]
+ set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf $url $distance
RobotFileClose $outf
}
} elseif {[string compare $href $url]} {
- set inf [RobotFileOpen visited $host $path r]
+ set inf [RobotFileOpen $task visited $host $path r]
RobotReadRecord $inf xurl olddistance
close $inf
if {[string length $olddistance] == 0} {
puts "OK remarking url=$url href=$href"
puts "olddistance = $olddistance"
puts "newdistance = $distance"
- set outf [RobotFileOpen unvisited $host $path]
+ set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf $url $distance
RobotFileClose $outf
}
}
}
-proc RobotTextHtml {url out} {
- global URL maxdistance
+proc RobotTextHtml {task url out} {
+ global URL control
# set title so we can emit it for the body
set title {}
set distance 0
set fdistance 0
- if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
- set fdistance $URL($url,dist)
+ if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
+ set fdistance $URL($task,$url,dist)
set distance [expr $fdistance + 1]
}
- htmlSwitch $URL($url,buf) \
+ htmlSwitch $URL($task,$url,buf) \
title {
set title $body
} -nonest meta {
continue
}
set href [string trim $parm(href)]
- if {![RobotHref $url href host path]} continue
- set URL($url,bpath) $path
+ if {![RobotHref $task $url href host path]} continue
+ set URL($task,$url,bpath) $path
} a {
# <a href="...."> .. </a>
# we're not using nonest - otherwise body isn't set
if {![info exists parm(href)]} {
continue
}
- link $url $out [string trim $parm(href)] $body $distance
+ link $task $url $out [string trim $parm(href)] $body $distance
} -nonest area {
if {$nofollow} continue
if {![info exists parm(href)]} {
continue
}
- link $url $out [string trim $parm(href)] $body $distance
+ link $task $url $out [string trim $parm(href)] $body $distance
} -nonest frame {
if {![info exists parm(src)]} {
continue
}
- link $url $out [string trim $parm(src)] $body $fdistance
+ link $task $url $out [string trim $parm(src)] $body $fdistance
}
}
-proc RobotsTxt {url} {
+proc RobotsTxt {task url} {
global agent URL
- RobotsTxt0 URL(URL($url,hostport),robots) $URL($url,buf)
+ RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
}
-proc RobotsTxt0 {v buf} {
+proc RobotsTxt0 {task v buf} {
global URL agent
set section 0
foreach l [split $buf \n] {
if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
+ set arg [string trim $arg]
puts "cmd=$cmd arg=$arg"
switch -- [string tolower $cmd] {
user-agent {
}
}
-proc RobotTextPlain {url out} {
+proc RobotTextPlain {task url out} {
global URL
puts $out "<documentcontent>"
- regsub -all {<} $URL($url,buf) {\<} content
+ regsub -all {<} $URL($task,$url,buf) {\<} content
puts $out $content
puts $out "</documentcontent>"
- if {![string compare $URL($url,path) /robots.txt]} {
- RobotsTxt $url
+ if {![string compare $URL($task,$url,path) /robots.txt]} {
+ RobotsTxt $task $url
}
}
-proc RobotWriteMetadata {url out} {
- global URL domains
+proc RobotWriteMetadata {task url out} {
+ global URL
puts $out "<zmbot>"
set distance 1000
- if {[RobotFileExist unvisited $URL($url,hostport) $URL($url,path)]} {
- set inf [RobotFileOpen unvisited $URL($url,hostport) $URL($url,path) r]
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
RobotReadRecord $inf fromurl distance
RobotFileClose $inf
}
- set URL($url,dist) $distance
+ set URL($task,$url,dist) $distance
puts $out "<distance>"
puts $out " $distance"
puts $out "</distance>"
- headSave $url $out
+ headSave $task $url $out
puts "Parsing $url distance=$distance"
- switch $URL($url,head,content-type) {
+ switch $URL($task,$url,head,content-type) {
text/html {
if {[string length $distance]} {
- RobotTextHtml $url $out
+ RobotTextHtml $task $url $out
}
}
text/plain {
- RobotTextPlain $url $out
+ RobotTextPlain $task $url $out
}
}
puts $out "</zmbot>"
}
-proc Robot200 {url} {
- global URL domains
+proc Robot200 {task url} {
+ global URL
- set out [RobotFileOpen raw $URL($url,hostport) $URL($url,path)]
- puts -nonewline $out $URL($url,buf)
+ set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
+ puts -nonewline $out $URL($task,$url,buf)
RobotFileClose $out
- set out [RobotFileOpen visited $URL($url,hostport) $URL($url,path)]
- RobotWriteMetadata $url $out
+ set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotWriteMetadata $task $url $out
RobotFileClose $out
- RobotFileUnlink unvisited $URL($url,hostport) $URL($url,path)
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
}
-proc RobotReadContent {url sock binary} {
+proc RobotReadContent {task url sock binary} {
global URL
set buffer [read $sock 16384]
set readCount [string length $buffer]
if {$readCount <= 0} {
- Robot200 $url
- RobotRestart $url $sock
+ Robot200 $task $url
+ RobotRestart $task $url $sock
} elseif {!$binary && [string first \0 $buffer] >= 0} {
- Robot200 $url
- RobotRestart $url $sock
+ Robot200 $task $url
+ RobotRestart $task $url $sock
} else {
# puts "Got $readCount bytes"
- set URL($url,buf) $URL($url,buf)$buffer
+ set URL($task,$url,buf) $URL($task,$url,buf)$buffer
}
}
-proc RobotReadHeader {url sock} {
+proc RobotReadHeader {task url sock} {
global URL debuglevel
if {$debuglevel > 1} {
puts "HTTP head $url"
}
if {[catch {set buffer [read $sock 2148]}]} {
- RobotError $url 404
- RobotRestart $url $sock
+ RobotError $task $url 404
+ RobotRestart $task $url $sock
return
}
set readCount [string length $buffer]
if {$readCount <= 0} {
- RobotError $url 404
- RobotRestart $url $sock
+ RobotError $task $url 404
+ RobotRestart $task $url $sock
} else {
# puts "Got $readCount bytes"
- set URL($url,buf) $URL($url,buf)$buffer
+ set URL($task,$url,buf) $URL($task,$url,buf)$buffer
- set n [string first \r\n\r\n $URL($url,buf)]
+ set n [string first \r\n\r\n $URL($task,$url,buf)]
if {$n > 1} {
set code 0
set version {}
- set headbuf [string range $URL($url,buf) 0 $n]
+ set headbuf [string range $URL($task,$url,buf) 0 $n]
incr n 4
- set URL($url,buf) [string range $URL($url,buf) $n end]
+ set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
set lines [split $headbuf \n]
foreach line $lines {
if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
- set URL($url,head,[string tolower $name]) [string trim $value]
+ set URL($task,$url,head,[string tolower $name]) [string trim $value]
}
}
puts "HTTP CODE $code"
- set URL($url,state) skip
+ set URL($task,$url,state) skip
switch $code {
301 {
- RobotRedirect $url $URL($url,head,location) 301
- RobotRestart $url $sock
+ RobotRedirect $task $url $URL($task,$url,head,location) 301
+ RobotRestart $task $url $sock
}
302 {
- RobotRedirect $url $URL($url,head,location) 302
- RobotRestart $url $sock
+ RobotRedirect $task $url $URL($task,$url,head,location) 302
+ RobotRestart $task $url $sock
}
200 {
- if {![info exists URL($url,head,content-type)]} {
- set URL($url,head,content-type) {}
+ if {![info exists URL($task,$url,head,content-type)]} {
+ set URL($task,$url,head,content-type) {}
}
set binary 1
- switch -glob -- $URL($url,head,content-type) {
+ switch -glob -- $URL($task,$url,head,content-type) {
text/* {
set binary 0
}
}
if {![regexp {/robots.txt$} $url]} {
- if {![checkrule mime $URL($url,head,content-type)]} {
- RobotError $url mimedeny
- RobotRestart $url $sock
+ if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
+ RobotError $task $url mimedeny
+ RobotRestart $task $url $sock
return
}
}
- fileevent $sock readable [list RobotReadContent $url $sock $binary]
+ fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
}
default {
- RobotError $url $code
- RobotRestart $url $sock
+ RobotError $task $url $code
+ RobotRestart $task $url $sock
}
}
}
}
}
-proc RobotSockCancel {url sock} {
+proc RobotSockCancel {task url sock} {
puts "RobotSockCancel sock=$sock url=$url"
- RobotError $url 401
- RobotRestart $url $sock
+ RobotError $task $url 401
+ RobotRestart $task $url $sock
}
-proc RobotConnect {url sock} {
+proc RobotConnect {task url sock} {
global URL agent acceptLanguage
fconfigure $sock -translation {lf crlf} -blocking 0
- fileevent $sock readable [list RobotReadHeader $url $sock]
- puts $sock "GET $URL($url,path) HTTP/1.0"
- puts $sock "Host: $URL($url,host)"
+ fileevent $sock readable [list RobotReadHeader $task $url $sock]
+ puts $sock "GET $URL($task,$url,path) HTTP/1.0"
+ puts $sock "Host: $URL($task,$url,host)"
puts $sock "User-Agent: $agent"
if {[string length $acceptLanguage]} {
puts $sock "Accept-Language: $acceptLanguage"
}
puts $sock ""
- flush $sock
- set URL($sock,cancel) [after 30000 [list RobotSockCancel $url $sock]]
+ set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
+ if {[catch {flush $sock}]} {
+ RobotError $task $url 404
+ RobotRestart $task $url $sock
+ }
}
proc RobotNop {} {
}
-proc RobotGetUrl {url phost} {
+proc RobotGetUrl {task url phost} {
global URL robotsRunning
flush stdout
- puts "Retrieve $robotsRunning url=$url"
+ puts "Retrieve running=$robotsRunning url=$url task=$task"
if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
return -1
}
set port 80
set host $hostport
}
- set URL($url,method) $method
- set URL($url,host) $host
- set URL($url,hostport) $hostport
- set URL($url,path) $path
- set URL($url,state) head
- set URL($url,buf) {}
+ set URL($task,$url,method) $method
+ set URL($task,$url,host) $host
+ set URL($task,$url,hostport) $hostport
+ set URL($task,$url,path) $path
+ set URL($task,$url,state) head
+ set URL($task,$url,buf) {}
if {[string compare $path /robots.txt]} {
set ok 1
if {![info exists URL($hostport,robots)]} {
puts "READING robots.txt for host $hostport"
- if {[RobotFileExist visited $hostport /robots.txt]} {
- set inf [RobotFileOpen visited $hostport /robots.txt r]
+ if {[RobotFileExist $task visited $hostport /robots.txt]} {
+ set inf [RobotFileOpen $task visited $hostport /robots.txt r]
set buf [read $inf 32768]
close $inf
} else {
set buf "User-agent: *\nAllow: /\n"
}
- RobotsTxt0 URL($hostport,robots) $buf
+ RobotsTxt0 $task URL($hostport,robots) $buf
}
if {[info exists URL($hostport,robots)]} {
foreach l $URL($hostport,robots) {
if [catch {set sock [socket -async $host $port]}] {
return -1
}
- RobotConnect $url $sock
+ RobotConnect $task $url $sock
return 0
}
}
}
-set agent "zmbot/0.1"
+set agent "zmbot/0.2"
if {![catch {set os [exec uname -s -r]}]} {
set agent "$agent ($os)"
}
}
set robotsRunning 0
-set robotSeq 0
set workdir [pwd]
-set idletime 60000
+set idletime 30000
set acceptLanguage {}
set debuglevel 0
-set status(unvisited) 0
-set status(visited) 0
-set status(bad) 0
-set status(raw) 0
-
# Rules: allow, deny, url
-proc checkrule {type this} {
- global alrules
+proc checkrule {task type this} {
+ global control
global debuglevel
if {$debuglevel > 3} {
puts "CHECKRULE $type $this"
}
- if {[info exist alrules]} {
- foreach l $alrules {
+ if {[info exist control($task,alrules)]} {
+ foreach l $control($task,alrules) {
if {$debuglevel > 3} {
puts "consider $l"
}
proc url {href} {
- global debuglevel
+ global debuglevel task
- if {[RobotHref http://www.indexdata.dk/ href host path]} {
- if {![RobotFileExist visited $host $path]} {
- set outf [RobotFileOpen unvisited $host $path]
+ if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf href 0
RobotFileClose $outf
}
}
proc deny {type stuff} {
- global alrules
+ global control task
- lappend alrules [list deny $type $stuff]
+ lappend control($task,alrules) [list deny $type $stuff]
}
proc allow {type stuff} {
- global alrules
+ global control task
- lappend alrules [list allow $type $stuff]
+ lappend control($task,alrules) [list allow $type $stuff]
}
proc debug {level} {
set debuglevel $level
}
+# task t
+# Make `t` the current task (global `task`) and register it if it is new.
+# A new task is appended to the global `tasks` list, gets zeroed status
+# counters, a sequence number of 0 and a default distance of 10.
+# Returns 1 when the task was newly registered, 0 when it already existed.
+proc task {t} {
+ global tasks task status robotSeq control
+
+ # The current task is switched even when t is already registered.
+ set task $t
+
+ if {[info exists tasks]} {
+ if {[lsearch -exact $tasks $t] >= 0} {
+ return 0
+ }
+ }
+
+ lappend tasks $t
+ set status($t,unvisited) 0
+ set status($t,visited) 0
+ set status($t,bad) 0
+ set status($t,raw) 0
+ set status($t,active) 1
+ set robotSeq($t) 0
+ set control($t,distance) 10
+ return 1
+}
+
+# chktask
+# Ensure that at least one task is present: when the global `tasks` list
+# does not exist yet, register a task named "main" via `task`.
+proc chktask {} {
+ global tasks
+ if {![info exist tasks]} {
+ task main
+ }
+}
+
+
# Parse options
set i 0
exit 1
}
+
+
while {$i < $l} {
set arg [lindex $argv $i]
switch -glob -- $arg {
+ -t* {
+ set t [string range $arg 2 end]
+ if {![string length $t]} {
+ set t [lindex $argv [incr i]]
+ }
+ task $t
+ }
+ -D* {
+ set dir [string range $arg 2 end]
+ if {![string length $dir]} {
+ set dir [lindex $argv [incr i]]
+ }
+ lappend daemon_dir $dir
+ }
-j* {
set robotsMax [string range $arg 2 end]
if {![string length $robotsMax]} {
}
}
-c* {
- set maxdistance [string range $arg 2 end]
- if {![string length $maxdistance]} {
- set maxdistance [lindex $argv [incr i]]
+ chktask
+ set control($task,distance) [string range $arg 2 end]
+ if {![string length $control($task,distance)]} {
+ set control($task,distance) [lindex $argv [incr i]]
}
}
-d* {
+ chktask
set dom [string range $arg 2 end]
if {![string length $dom]} {
set dom [lindex $argv [incr i]]
}
- lappend domains $dom
+ lappend control($task,domains) $dom
}
-i* {
set idletime [string range $arg 2 end]
}
}
-l* {
+ chktask
set acceptLanguage [string range $arg 2 end]
if {![string length $acceptLanguage]} {
set acceptLanguage [lindex $argv [incr i]]
}
}
-r* {
+ chktask
set rfile [string range $arg 2 end]
if {![string length $rfile]} {
set rfile [lindex $argv [incr i]]
}
+ catch {unset maxdistance}
source $rfile
+ if {[info exists maxdistance]} {
+ set control($task,distance) $maxdistance
+ }
}
default {
+ chktask
set href $arg
- if {[RobotHref http://www.indexdata.dk/ href host path]} {
- if {![RobotFileExist visited $host $path]} {
- set outf [RobotFileOpen unvisited $host $path]
+ if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
RobotWriteRecord $outf href 0
RobotFileClose $outf
}
incr i
}
-if {![info exist domains]} {
- set domains {*}
-}
-if {![info exist maxdistance]} {
- set maxdistance 50
-}
if {![info exist robotsMax]} {
set robotsMax 5
}
-puts "domains=$domains"
-puts "max distance=$maxdistance"
-puts "max jobs=$robotsMax"
-
-
-RobotStart
-
-
-while {$robotsRunning} {
- vwait robotsRunning
+if {[info exist daemon_dir]} {
+ RobotDaemonLoop
+} else {
+ foreach t $tasks {
+ puts "task $t"
+ puts "max distance=$control($t,distance)"
+ if {[info exists control($t,domains)]} {
+ puts "domains=$control($t,domains)"
+ }
+ }
+ puts "max jobs=$robotsMax"
+
+ foreach t $tasks {
+ RobotStart $t
+ }
+
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+
+ if {[info exists tasks]} {
+ foreach t $tasks {
+ set statusfile [open $t/status w]
+ puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+ close $statusfile
+ }
+ }
}
-set statusfile [open status w]
-puts $statusfile "$status(unvisited) $status(bad) $status(visited)"
-close $statusfile
-