--- /dev/null
+#!/usr/bin/tclsh
+# $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $
+#
+set loghandle stdout
+set robotsRunning 0
+set workdir [pwd]
+set idletime 15000
+set acceptLanguage {}
+set debuglevel 1
+set libdir ""
+
+
+proc logmsg {msg} {
+ global loghandle
+
+ puts $loghandle $msg
+ flush $loghandle
+}
+
+#proc dbgmsg {level msg} {
+# global debuglevel
+# if {[expr $debuglevel >= $level]} {
+# logmsg $msg
+# }
+#}
+proc dbgmsg {msg} {
+ global debuglevel
+ if {[expr $debuglevel >= 0]} {
+ logmsg $msg
+ }
+}
+# dbgmsg is always called with just one string!
+
+
+proc fnameEncode {fname} {
+ regsub -all {&} $fname {%38} fname
+ regsub -all {<} $fname {%3C} fname
+ regsub -all {>} $fname {%3E} fname
+ regsub -all {\?} $fname {%3F} fname
+ regsub -all {\*} $fname {%2A} fname
+ return $fname
+}
+
+proc fnameDecode {fname} {
+ regsub -all {%38} $fname {&} fname
+ regsub -all {%3C} $fname {<} fname
+ regsub -all {%3E} $fname {>} fname
+ regsub -all {%3F} $fname {?} fname
+ regsub -all {%2A} $fname {*} fname
+ return $fname
+}
+
+proc RobotFileNext1 {area lead} {
+ # dbgmsg "RobotFileNext1 area=$area lead=$lead"
+ if {[catch {set ns [glob ${area}/*]}]} {
+ return {}
+ }
+ foreach n $ns {
+ if {[file isfile $n]} {
+ set off [string last / $n]
+ # skip /
+ incr off
+ set end [string length $n]
+ # skip _.tkl
+ incr end -6
+ return $lead/[string range $n $off $end]
+ }
+ }
+ foreach n $ns {
+ if {[file isdirectory $n]} {
+ set off [string last / $n]
+ # skip /
+ incr off
+ set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
+ if {[string length $sb]} {
+ return $sb
+ }
+ }
+ }
+ return {}
+}
+
+proc RobotWriteRecord {outf fromurl distance} {
+ puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
+ puts $outf "<zmbot>"
+ puts $outf "<distance>"
+ puts $outf $distance
+ puts $outf "</distance>"
+ puts $outf "<fromurl>"
+ puts $outf $fromurl
+ puts $outf "</fromurl>"
+ puts $outf "</zmbot>"
+}
+
+proc RobotReadRecord {inf fromurlx distancex} {
+ upvar $fromurlx fromurl
+ upvar $distancex distance
+ gets $inf
+ gets $inf
+ gets $inf
+ set distance [string trim [gets $inf]]
+ # dbgmsg "got distance = $distance"
+ gets $inf
+ gets $inf
+ set fromurl [string trim [gets $inf]]
+}
+
+proc RobotFileNext {task area} {
+ global control
+ global idletime ns
+ global status
+
+ # dbgmsg "RobotFileNext seq=$control($task,seq)"
+ if {$control($task,seq) < 0} {
+ return {}
+ }
+ set target $control($task,target)
+ if {$control($task,seq) == 0} {
+ if {[catch {set ns($task) [glob $target/$area/*]}]} {
+ puts "----------- DONE-------- target=$target"
+ return done
+ }
+ }
+ # dbgmsg "ns=$ns($task)"
+ set off [string length $target/$area]
+ incr off
+ set n [lindex $ns($task) $control($task,seq)]
+ # dbgmsg "n=$n"
+ if {![string length $n]} {
+ set control($task,seq) -1
+ set statusfile [open $target/status w]
+ puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
+ close $statusfile
+ return wait
+ }
+ incr control($task,seq)
+ if {[file isfile $n/robots.txt_.tkl]} {
+ # dbgmsg "ok returning http://[string range $n $off end]/robots.txt"
+ return [fnameDecode http://[string range $n $off end]/robots.txt]
+ } elseif {[file isdirectory $n]} {
+ set sb [RobotFileNext1 $n http://[string range $n $off end]]
+ if {[string length $sb]} {
+ return [fnameDecode $sb]
+ }
+ }
+ dbgmsg "no more work at end of RobotFileNext n=$n"
+ dbgmsg "ns=$ns($task)"
+ return {}
+}
+
+
+proc RobotFileExist {task area host path} {
+ global debuglevel control
+
+ if {$debuglevel > 3} {
+ dbgmsg "RobotFileExist begin area=$area host=$host path=$path"
+ }
+ set target $control($task,target)
+ return [file exists [fnameEncode $target/$area/$host${path}_.tkl]]
+}
+
+proc RobotFileUnlink {task area host path} {
+ global status control
+
+ set target $control($task,target)
+ # dbgmsg "RobotFileUnlink begin"
+ # dbgmsg "area=$area host=$host path=$path"
+ set npath [fnameEncode $target/$area/$host${path}_.tkl]
+ # dbgmsg "npath=$npath"
+ set comp [split $npath /]
+ if {[catch {exec rm $npath}]} return
+
+ set l [llength $comp]
+ incr l -2
+ incr status($task,$area) -1
+ for {set i $l} {$i > 0} {incr i -1} {
+ set path [join [lrange $comp 0 $i] /]
+ if {![catch {glob $path/*}]} return
+ exec rmdir $path
+ }
+ # dbgmsg "RobotFileUnlink end"
+}
+
+proc RobotFileClose {out} {
+ if [string compare $out stdout] {
+ close $out
+ }
+}
+
+proc RobotFileOpen {task area host path {mode w}} {
+ set orgPwd [pwd]
+ global workdir status debuglevel control
+
+ # dbgmsg "RobotFileOpen task=$task path=$path"
+
+ set target $control($task,target)
+ set path [fnameEncode $path]
+
+ if {![info exists workdir]} {
+ return stdout
+ }
+ if {$debuglevel > 3} {
+ dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+ }
+ if {[string compare $orgPwd $workdir]} {
+ dbgmsg "ooops. RobotFileOpen failed"
+ dbgmsg "workdir = $workdir"
+ dbgmsg "pwd = $orgPwd"
+ exit 1
+ }
+
+ set comp [split $target/$area/$host /]
+ set len [llength $comp]
+ incr len -1
+
+ # dbgmsg "1 comp=$comp"
+
+ for {set i 0} {$i <= $len} {incr i} {
+ set d [lindex $comp $i]
+ if {[string length $d] == 0} {
+ cd /
+ } elseif {[catch {cd $d}]} {
+ exec mkdir $d
+ cd ./$d
+ if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
+ if {[string compare $path /robots.txt]} {
+ set out [open robots.txt_.tkl w]
+ dbgmsg "creating robots.txt in $d"
+ close $out
+ incr status($task,unvisited)
+ }
+ }
+ }
+ }
+
+ set comp [split $path /]
+ set len [llength $comp]
+ incr len -1
+
+ # dbgmsg "2 path=$path comp=$comp"
+
+ for {set i 0} {$i < $len} {incr i} {
+ set d [lindex $comp $i]
+ if {[string length $d] > 0} {
+ if {[catch {cd $d}]} {
+ exec mkdir $d
+ cd ./$d
+ }
+ }
+ }
+ set d [lindex $comp $len]
+ set out [open ${d}_.tkl $mode]
+ if {$mode == "w"} {
+ incr status($task,$area)
+ }
+ cd $orgPwd
+ return $out
+}
+
+proc RobotStartJob {root task} {
+ global control
+
+ set fname "$root$task"
+ set f [open $fname r]
+ set xml [read $f]
+ dbgmsg "Reading $fname"
+ close $f
+ # task type must be 2
+ if {![regexp {<tasktype>([^<]*)</tasktype>} $xml x tasktype]} {
+ return
+ }
+ set tasktype [string trim $tasktype]
+ if {![string match 2 $tasktype]} {
+ return
+ }
+ # status must not be finished or error
+ if {![regexp {<status>([^<]*)</status>} $xml x status]} {
+ return
+ }
+ if {$status == "finished"} {
+ dbgmsg "already finished"
+ return
+ }
+ if {$status == "error"} {
+ dbgmsg "already finished due to error"
+ return
+ }
+ # ignore if task has already been processed
+ dbgmsg "status = $status"
+ if {![CreateTask $task]} {
+ return
+ }
+ set control($task,taskfname) $fname
+ dbgmsg "Reading $fname stage 2"
+ htmlSwitch $xml \
+ url {
+ lappend starturls $body
+ } filter {
+ set type $parm(type)
+ set action $parm(action)
+ if {$type == "domain"} {
+ $action url http://$body/*
+ }
+ if {$type == "url"} {
+ $action url $body
+ }
+ if {$type == "mime"} {
+ $action mime $body
+ }
+ } target {
+ set ex [file rootname [file tail $task]]
+ #set control($task,target) "$root$body/$ex"
+ set control($task,target) "$control(tmpdir)/$ex"
+ set control($task,output) "$root$body"
+ } distance {
+ set control($task,distance) $body
+ } status {
+ set control($task,filestatus) $body
+ } tasktype {
+ set control($task,tasktype) $body
+ }
+
+ if {[info exists starturls]} {
+ foreach url $starturls {
+ puts "marking start urls $url"
+ url $url
+ }
+ }
+
+ if {$status == "pending"} {
+ regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
+ set f [open $fname w]
+ puts -nonewline $f $xml2
+ close $f
+ }
+}
+
+proc RobotDoneJob {task} {
+ global daemon_dir control
+
+ if {![info exists daemon_dir]} {
+ return
+ }
+ set fname $control($task,taskfname)
+ set f [open $fname r]
+ set xml [read $f]
+ dbgmsg "Reading $fname"
+ regexp {<status>([^<]*)</status>} $xml x status
+ dbgmsg "------"
+ dbgmsg "status = $status"
+ close $f
+
+ regsub {<status>[^<]*</status>} $xml {<status>finished</status>} xml2
+ set f [open $fname w]
+ puts -nonewline $f $xml2
+ close $f
+}
+
+proc RobotScanDir {} {
+ global daemon_dir
+
+ if {![info exists daemon_dir]} {
+ return
+ }
+ foreach d $daemon_dir {
+ if {[catch {set files [glob $d/*.spl]}]} {
+ return
+ }
+ foreach fname $files {
+ if {[file isfile $fname] && [file readable $fname]} {
+ set jobfile [open $fname]
+ gets $jobfile portalroot
+ gets $jobfile portaltask
+ close $jobfile
+
+ RobotStartJob $portalroot $portaltask
+ }
+ }
+ }
+}
+
+proc RobotRR {task} {
+ global control robotsRunning tasks robotsMax status
+
+ dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
+ incr robotsRunning -1
+
+ # only one task gets through...
+ if {[string compare [lindex $tasks 0] $task]} {
+ return
+ }
+ dbgmsg "RobotRR. task = $task"
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+ dbgmsg "Scan"
+ if {[catch {RobotScanDir} msg]} {
+ logmsg "RobotScanDir failed"
+ logmsg $msg
+ }
+ foreach t $tasks {
+ set target $control($t,target)
+ set statusfile [open $target/status w]
+ puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+ close $statusfile
+ set control($t,seq) 0
+ RobotStart $t
+ }
+}
+
+proc RobotDaemonSig {} {
+ global daemon_cnt
+
+ incr daemon_cnt
+}
+
+proc RobotDaemonLoop {} {
+ global daemon_cnt tasks robotsRunning status
+
+ set daemon_cnt 0
+ while 1 {
+ logmsg $daemon_cnt
+ RobotScanDir
+
+ if {[info exists tasks]} {
+ logmsg "daemon loop tasks $tasks"
+ foreach t $tasks {
+ set control($t,seq) 0
+ RobotStart $t
+ }
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+ }
+ after 30000 RobotDaemonSig
+ vwait daemon_cnt
+ }
+}
+
+proc RobotRestart {task url sock} {
+ global URL robotsRunning
+
+ close $sock
+ after cancel $URL($sock,cancel)
+
+ foreach v [array names URL $task,$url,*] {
+ unset URL($v)
+ }
+
+ incr robotsRunning -1
+ RobotStart $task
+}
+
+proc RobotStart {task} {
+ global URL
+ global robotsRunning robotsMax idletime status tasks
+
+ # dbgmsg "RobotStart $task running=$robotsRunning"
+ while {1} {
+ set url [RobotFileNext $task unvisited]
+ if {[string compare $url done] == 0} {
+ dbgmsg "In RobotStart task $task done"
+
+ catch {unset ntasks}
+ foreach t $tasks {
+ if {[string compare $t $task]} {
+ lappend ntasks $t
+ } else {
+ dbgmsg "task $t done"
+ }
+ }
+ if {![info exists ntasks]} {
+ unset tasks
+ dbgmsg "all done"
+ } else {
+ set tasks $ntasks
+ }
+ RobotDoneJob $task
+ return
+ }
+ if {![string length $url]} {
+ return
+ }
+ incr robotsRunning
+ if {[string compare $url wait] == 0} {
+ after $idletime [list RobotRR $task]
+ return
+ }
+ set r [RobotGetUrl $task $url {}]
+ if {!$r} {
+ if {$robotsRunning >= $robotsMax} return
+ } else {
+ incr robotsRunning -1
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotFileClose $outf
+ }
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+ }
+ }
+}
+
+proc headSave {task url out} {
+ global URL
+
+ if {[info exists URL($task,$url,head,last-modified)]} {
+ puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
+ }
+ puts $out {<si>}
+ if {[info exists URL($task,$url,head,date)]} {
+ puts $out " <date>$URL($task,$url,head,date)</date>"
+ }
+ if {[info exists URL($task,$url,head,content-length)]} {
+ puts $out " <by>$URL($task,$url,head,content-length)</by>"
+ }
+ if {[info exists URL($task,$url,head,server)]} {
+ puts $out " <format>$URL($task,$url,head,server)</format>"
+ }
+ puts $out {</si>}
+ puts $out {<publisher>}
+ puts $out " <identifier>$url</identifier>"
+ if {[info exists URL($task,$url,head,content-type)]} {
+ puts $out " <type>$URL($task,$url,head,content-type)</type>"
+ }
+ puts $out {</publisher>}
+}
+
+proc RobotHref {task url hrefx hostx pathx} {
+ global URL control debuglevel
+ upvar $hrefx href
+ upvar $hostx host
+ upvar $pathx path
+
+ if {$debuglevel > 1} {
+ dbgmsg "Ref input url = $url href=$href"
+ }
+
+ if {[string first { } $href] >= 0} {
+ return 0
+ }
+ if {[string length $href] > 256} {
+ return 0
+ }
+
+# Skip pages that have ? in them
+# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+# return 0
+# }
+ # get method (if any)
+ if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
+ set hpath $href
+ set method http
+ } else {
+ if {[string compare $method http]} {
+ return 0
+ }
+ }
+ # get host (if any)
+ if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
+ if {![string length $surl]} {
+ set surl /
+ }
+ if {[info exist control($task,domains)]} {
+ set ok 0
+ foreach domain $control($task,domains) {
+ if {[string match $domain $host]} {
+ set ok 1
+ break
+ }
+ }
+ if {!$ok} {
+ return 0
+ }
+ }
+ } else {
+ regexp {^([^\#]*)} $hpath x surl
+ set host $URL($task,$url,hostport)
+ }
+ if {![string length $surl]} {
+ return 0
+ }
+ if {[string first / $surl]} {
+ # relative path
+ set curpath $URL($task,$url,path)
+ if {[info exists URL($task,$url,bpath)]} {
+ set curpath $URL($task,$url,bpath)
+ }
+ regexp {^([^\#?]*)} $curpath x dpart
+ set l [string last / $dpart]
+ if {[expr $l >= 0]} {
+ set surl [string range $dpart 0 $l]$surl
+ } else {
+ set surl $dpart/$surl
+ }
+ }
+ set surllist [split $surl /]
+ catch {unset path}
+ set pathl 0
+ foreach c $surllist {
+ switch -- $c {
+ .. {
+ if {$pathl > 1} {
+ incr pathl -2
+ set path [lrange $path 0 $pathl]
+ incr pathl
+ }
+ }
+ . {
+
+ }
+ default {
+ incr pathl
+ lappend path $c
+ }
+ }
+ }
+ if {$debuglevel > 4} {
+ dbgmsg "pathl=$pathl output path=$path"
+ }
+ set path [join $path /]
+ if {![string length $path]} {
+ set path /
+ }
+ regsub -all {~} $path {%7E} path
+ set href "$method://$host$path"
+
+ if {$debuglevel > 1} {
+ dbgmsg "Ref result = $href"
+ }
+ return [checkrule $task url $href]
+}
+
+proc RobotError {task url code} {
+ global URL
+
+ dbgmsg "Bad URL $url (code $code)"
+ set fromurl {}
+ set distance -1
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+}
+
+proc RobotRedirect {task url tourl code} {
+ global URL
+
+ dbgmsg "Redirecting from $url to $tourl"
+
+ set distance {}
+ set fromurl {}
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ if {[RobotHref $task $url tourl host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ if {![RobotFileExist $task unvisited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf $fromurl $distance
+ RobotFileClose $outf
+ }
+ } else {
+ set olddistance {}
+ set inf [RobotFileOpen $task visited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[string length $distance] == 0} {
+ set distance 1000
+ }
+ dbgmsg "distance=$distance olddistance=$olddistance"
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf $tourl $distance
+ RobotFileClose $outf
+ }
+ }
+ }
+ if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
+ dbgmsg "unlink failed"
+ exit 1
+ }
+}
+
+proc wellform {body} {
+ regsub -all {<!--[^-]*-->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} body
+ regsub -all {<[^\>]+>} $body {} abody
+ regsub -all { } $abody { } body
+ regsub -all {&} $body {&} abody
+ return $abody
+}
+
+proc link {task url out href body distance} {
+ global URL control
+ if {[expr $distance > $control($task,distance)]} return
+
+ if {![RobotHref $task $url href host path]} return
+
+ if ($control($task,cr)) {
+ puts $out "<cr>"
+ puts $out "<identifier>$href</identifier>"
+ set abody [wellform $body]
+ puts $out "<description>$abody</description>"
+ puts $out "</cr>"
+ }
+
+ if {![RobotFileExist $task visited $host $path]} {
+ set olddistance 1000
+ if {![RobotFileExist $task bad $host $path]} {
+ if {[RobotFileExist $task unvisited $host $path]} {
+ set inf [RobotFileOpen $task unvisited $host $path r]
+ RobotReadRecord $inf oldurl olddistance
+ RobotFileClose $inf
+ }
+ } else {
+ set olddistance 0
+ }
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ } elseif {[string compare $href $url]} {
+ set inf [RobotFileOpen $task visited $host $path r]
+ RobotReadRecord $inf xurl olddistance
+ close $inf
+ if {[string length $olddistance] == 0} {
+ set olddistance 1000
+ }
+ if {[expr $distance < $olddistance]} {
+ dbgmsg "OK remarking url=$url href=$href"
+ dbgmsg "olddistance = $olddistance"
+ dbgmsg "newdistance = $distance"
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf $url $distance
+ RobotFileClose $outf
+ }
+ }
+}
+
+proc RobotTextTkl {task url out} {
+ global URL control
+
+ # set title so we can emit it for the body
+ set title {}
+ # if true, nothing will be indexed
+ set noindex 0
+ # if true, nothing will be followed
+ set nofollow 0
+
+ puts $control($task,output)
+
+ set out stdout
+ set distance distance
+
+ htmlSwitch $URL($task,$url,buf) \
+ title {
+ # når title tag er hittet, er body set til indholdet af tagget
+ set title $body
+ } -nonest meta {
+ #puts -nonewline $out "<meta"
+ # al er list med attribut navne som fandtes ind i parm hash
+ set al [array names parm]
+ # løkke igennem attributer
+ foreach a $al {
+ set al [string tolower $a]
+ #puts -nonewline $out " $al"
+ #puts -nonewline $out {="}
+ #puts -nonewline $out $parm($a)
+ #puts -nonewline $out {"}
+ unset parm($al)
+ }
+ #puts $out "></meta>"
+ } body {
+ # don't print title of document content if noindex is used
+ if {!$noindex} {
+ #puts $out "<title>$title</title>"
+ # xml compilancy added
+ set bbody [wellform $body]
+ #puts $out "<documentcontent>"
+ #puts $out $bbody
+ #puts $out "</documentcontent>"
+ }
+ } -nonest base {
+ # <base href=.. >
+ if {![info exists parm(href)]} {
+ continue
+ }
+ set href [string trim $parm(href)]
+ } a {
+ # <a href="...."> .. </a>
+ # we're not using nonest - otherwise body isn't set
+ if {$nofollow} continue
+ if {![info exists parm(href)]} {
+ continue
+ }
+ #puts "link $task $url $out [string trim $parm(href)] $body $distance"
+ } -nonest area {
+ if {$nofollow} continue
+ if {![info exists parm(href)]} {
+ continue
+ }
+ #puts "link $task $url $out [string trim $parm(href)] $body $distance"
+ } -nonest frame {
+ if {![info exists parm(src)]} {
+ continue
+ }
+ #puts "link $task $url $out [string trim $parm(src)] $body $fdistance"
+ }
+}
+
+proc RobotTextHtml {task url out} {
+ global URL control
+
+ # set title so we can emit it for the body
+ set title {}
+ # if true, nothing will be indexed
+ set noindex 0
+ # if true, nothing will be followed
+ set nofollow 0
+
+ set distance 0
+ set fdistance 0
+ if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
+ set fdistance $URL($task,$url,dist)
+ set distance [expr $fdistance + 1]
+ }
+ htmlSwitch $URL($task,$url,buf) \
+ title {
+ set title $body
+ } -nonest meta {
+ # collect metadata and save NAME= CONTENT=..
+ set metaname {}
+ set metacontent {}
+ puts -nonewline $out "<meta"
+ set al [array names parm]
+ foreach a $al {
+ set al [string tolower $a]
+ puts -nonewline $out " $al"
+ puts -nonewline $out {="}
+ puts -nonewline $out $parm($a)
+ puts -nonewline $out {"}
+ switch -- $al {
+ "name" {
+ set metaname [string tolower $parm($a)]
+ }
+ "content" {
+ set metacontent $parm($a)
+ }
+ }
+ unset parm($al)
+ }
+ puts $out "></meta>"
+ # go through robots directives (af any)
+ if {![string compare $metaname robots]} {
+ set direcs [split [string tolower $metacontent] ,]
+ if {[lsearch $direcs noindex] >= 0} {
+ set noindex 1
+ }
+ if {[lsearch $direcs nofollow] >= 0} {
+ set nofollow 1
+ }
+ }
+ } body {
+ # don't print title of document content if noindex is used
+ if {!$noindex} {
+ puts $out "<title>$title</title>"
+ set bbody [wellform $body]
+ puts $out "<documentcontent>"
+ puts $out $bbody
+ puts $out "</documentcontent>"
+ }
+ } -nonest base {
+ # <base href=.. >
+ if {![info exists parm(href)]} {
+ continue
+ }
+ set href [string trim $parm(href)]
+ if {![RobotHref $task $url href host path]} continue
+ set URL($task,$url,bpath) $path
+ } a {
+ # <a href="...."> .. </a>
+ # we're not using nonest - otherwise body isn't set
+ if {$nofollow} continue
+ if {![info exists parm(href)]} {
+ continue
+ }
+ link $task $url $out [string trim $parm(href)] $body $distance
+ } -nonest area {
+ if {$nofollow} continue
+ if {![info exists parm(href)]} {
+ continue
+ }
+ link $task $url $out [string trim $parm(href)] $body $distance
+ } -nonest frame {
+ if {![info exists parm(src)]} {
+ continue
+ }
+ link $task $url $out [string trim $parm(src)] $body $fdistance
+ }
+}
+
+proc RobotsTxt {task url} {
+ global agent URL
+
+ RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
+}
+
+proc RobotsTxt0 {task v buf} {
+ global URL agent
+ set section 0
+ foreach l [split $buf \n] {
+ if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
+ set arg [string trim $arg]
+ dbgmsg "cmd=$cmd arg=$arg"
+ switch -- [string tolower $cmd] {
+ user-agent {
+ if {$section} break
+ set pat [string tolower $arg]*
+ set section [string match $pat $agent]
+ }
+ disallow {
+ if {$section} {
+ dbgmsg "rule [list 0 $arg]"
+ lappend $v [list 0 $arg]
+ }
+ }
+ allow {
+ if {$section} {
+ dbgmsg "rule [list 1 $arg]"
+ lappend $v [list 1 $arg]
+ }
+ }
+ }
+ }
+ }
+}
+
+proc RobotTextPlain {task url out} {
+ global URL
+
+ puts $out "<documentcontent>"
+ regsub -all {<} $URL($task,$url,buf) {\<} content
+ puts $out $content
+ puts $out "</documentcontent>"
+
+ if {![string compare $URL($task,$url,path) /robots.txt]} {
+ RobotsTxt $task $url
+ }
+}
+
+proc RobotWriteMetadata {task url out} {
+ global URL
+
+ set charset $URL($task,$url,charset)
+ puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
+ puts $out "<zmbot>"
+
+ set distance 1000
+ if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+ set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+ RobotReadRecord $inf fromurl distance
+ RobotFileClose $inf
+ }
+ set URL($task,$url,dist) $distance
+ puts $out "<distance>"
+ puts $out " $distance"
+ puts $out "</distance>"
+ headSave $task $url $out
+ logmsg "Parsing $url distance=$distance"
+ switch $URL($task,$url,head,content-type) {
+ text/html {
+ if {[string length $distance]} {
+ RobotTextHtml $task $url $out
+ RobotTextTkl $task $url $out
+ }
+ }
+ text/plain {
+ RobotTextPlain $task $url $out
+ }
+ }
+ puts $out "</zmbot>"
+}
+
+proc Robot200 {task url} {
+ global URL
+
+ set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
+ puts -nonewline $out $URL($task,$url,buf)
+ RobotFileClose $out
+
+ set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
+ RobotWriteMetadata $task $url $out
+ RobotFileClose $out
+
+ RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+}
+
+proc RobotReadContent {task url sock binary} {
+ global URL
+
+ set buffer [read $sock 16384]
+ set readCount [string length $buffer]
+
+ if {$readCount <= 0} {
+ Robot200 $task $url
+ RobotRestart $task $url $sock
+ } elseif {!$binary && [string first \0 $buffer] >= 0} {
+ Robot200 $task $url
+ RobotRestart $task $url $sock
+ } else {
+ # dbgmsg "Got $readCount bytes"
+ set URL($task,$url,buf) $URL($task,$url,buf)$buffer
+ }
+}
+
+proc RobotReadHeader {task url sock} {
+ global URL debuglevel
+
+ if {$debuglevel > 1} {
+ dbgmsg "HTTP head $url"
+ }
+ if {[catch {set buffer [read $sock 2148]}]} {
+ RobotError $task $url 404
+ RobotRestart $task $url $sock
+ return
+ }
+ set readCount [string length $buffer]
+
+ if {$readCount <= 0} {
+ RobotError $task $url 404
+ RobotRestart $task $url $sock
+ } else {
+ # dbgmsg "Got $readCount bytes"
+ set URL($task,$url,buf) $URL($task,$url,buf)$buffer
+
+ set n [string first \r\n\r\n $URL($task,$url,buf)]
+ if {$n > 1} {
+ set code 0
+ set version {}
+ set headbuf [string range $URL($task,$url,buf) 0 $n]
+ incr n 4
+ set URL($task,$url,charset) ISO-8859-1
+ set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
+
+ regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
+ set lines [split $headbuf \n]
+ foreach line $lines {
+ if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
+ set URL($task,$url,head,[string tolower $name]) [string trim $value]
+ }
+ regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
+ }
+ dbgmsg "HTTP CODE $code"
+ set URL($task,$url,state) skip
+ switch $code {
+ 301 {
+ RobotRedirect $task $url $URL($task,$url,head,location) 301
+ RobotRestart $task $url $sock
+ }
+ 302 {
+ RobotRedirect $task $url $URL($task,$url,head,location) 302
+ RobotRestart $task $url $sock
+ }
+ 200 {
+ if {![info exists URL($task,$url,head,content-type)]} {
+ set URL($task,$url,head,content-type) {}
+ }
+ set binary 1
+ switch -glob -- $URL($task,$url,head,content-type) {
+ text/* {
+ set binary 0
+ }
+ }
+ if {![regexp {/robots.txt$} $url]} {
+ if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
+ RobotError $task $url mimedeny
+ RobotRestart $task $url $sock
+ return
+ }
+ }
+ fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
+ }
+ default {
+ RobotError $task $url $code
+ RobotRestart $task $url $sock
+ }
+ }
+ }
+ }
+}
+
+proc RobotSockCancel {task url sock} {
+
+ logmsg "RobotSockCancel sock=$sock url=$url"
+ RobotError $task $url 401
+ RobotRestart $task $url $sock
+}
+
+proc RobotConnect {task url sock} {
+ global URL agent acceptLanguage
+
+ fconfigure $sock -translation {lf crlf} -blocking 0
+ fileevent $sock readable [list RobotReadHeader $task $url $sock]
+ puts $sock "GET $URL($task,$url,path) HTTP/1.0"
+ puts $sock "Host: $URL($task,$url,host)"
+ puts $sock "User-Agent: $agent"
+ if {[string length $acceptLanguage]} {
+ puts $sock "Accept-Language: $acceptLanguage"
+ }
+ puts $sock ""
+ set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
+ if {[catch {flush $sock}]} {
+ RobotError $task $url 404
+ RobotRestart $task $url $sock
+ }
+}
+
+proc RobotNop {} {
+
+}
+
+proc RobotGetUrl {task url phost} {
+ global URL robotsRunning
+ flush stdout
+ dbgmsg "Retrieve running=$robotsRunning url=$url task=$task"
+ if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
+ return -1
+ }
+ if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
+ set port 80
+ set host $hostport
+ }
+ set URL($task,$url,method) $method
+ set URL($task,$url,host) $host
+ set URL($task,$url,hostport) $hostport
+ set URL($task,$url,path) $path
+ set URL($task,$url,state) head
+ set URL($task,$url,buf) {}
+
+ if {[string compare $path /robots.txt]} {
+ set ok 1
+ if {![info exists URL($hostport,robots)]} {
+ dbgmsg "READING robots.txt for host $hostport"
+ if {[RobotFileExist $task visited $hostport /robots.txt]} {
+ set inf [RobotFileOpen $task visited $hostport /robots.txt r]
+ set buf [read $inf 32768]
+ close $inf
+ } else {
+ set buf "User-agent: *\nAllow: /\n"
+ }
+ RobotsTxt0 $task URL($hostport,robots) $buf
+ }
+ if {[info exists URL($hostport,robots)]} {
+ foreach l $URL($hostport,robots) {
+ if {[string first [lindex $l 1] $path] == 0} {
+ set ok [lindex $l 0]
+ break
+ }
+ }
+ }
+ if {!$ok} {
+ dbgmsg "skipped due to robots.txt"
+ return -1
+ }
+ }
+ if [catch {set sock [socket -async $host $port]}] {
+ return -1
+ }
+ RobotConnect $task $url $sock
+
+ return 0
+}
+
+proc loadlib {} {
+ global libdir
+
+ if {![llength [info commands htmlSwitch]]} {
+ if {[info exists env(tclrobot_lib)]} {
+ set d $env(tclrobot_lib)
+ } else {
+ if { $libdir > "" } {
+ set d $libdir
+ } else {
+ set d .
+ }
+ }
+ set e [info sharedlibextension]
+ dbgmsg "About to load $d/tclrobot$e"
+ if {[catch {load $d/tclrobot$e}]} {
+ dbgmsg "Didn't get at $d, trying directly"
+ load tclrobot$e
+ }
+ dbgmsg "Loaded tclrobot$e all right"
+ }
+}
+
+set agent "zmbot/0.2"
+if {![catch {set os [exec uname -s -r]}]} {
+ set agent "$agent ($os)"
+}
+
+dbgmsg "agent: $agent"
+
+proc bgerror {m} {
+ global errorInfo
+ dbgmsg "BGERROR $m"
+ dbgmsg $errorInfo
+}
+
+# Rules: allow, deny, url
+
+proc checkrule {task type this} {
+ global control
+ global debuglevel
+
+ set default_ret 1
+
+ if {$debuglevel > 3} {
+ dbgmsg "CHECKRULE $type $this"
+ }
+ if {[info exist control($task,alrules)]} {
+ foreach l $control($task,alrules) {
+ if {$debuglevel > 3} {
+ dbgmsg "consider $l"
+ }
+ # consider type
+ if {[lindex $l 1] != $type} continue
+ # consider mask (! negates)
+ set masks [lindex $l 2]
+ set ok 0
+ set default_ret 0
+ foreach mask $masks {
+ if {$debuglevel > 4} {
+ dbgmsg "consider single mask $mask"
+ }
+ if {[string index $mask 0] == "!"} {
+ set mask [string range $mask 1 end]
+ if {[string match $mask $this]} continue
+ } else {
+ if {![string match $mask $this]} continue
+ }
+ set ok 1
+ }
+ if {$debuglevel > 4} {
+ dbgmsg "ok = $ok"
+ }
+ if {!$ok} continue
+ # OK, we have a match
+ if {[lindex $l 0] == "allow"} {
+ if {$debuglevel > 3} {
+ dbgmsg "CHECKRULE MATCH OK"
+ }
+ return 1
+ } else {
+ if {$debuglevel > 3} {
+ dbgmsg "CHECKFULE MATCH FAIL"
+ }
+ return 0
+ }
+ }
+ }
+ if {$debuglevel > 3} {
+ dbgmsg "CHECKRULE MATCH DEFAULT $default_ret"
+ }
+ return $default_ret
+}
+
+
+proc url {href} {
+ global debuglevel task
+
+ if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf href 0
+ RobotFileClose $outf
+ }
+ }
+}
+
+proc deny {type stuff} {
+ global control task
+
+ lappend control($task,alrules) [list deny $type $stuff]
+}
+
+proc allow {type stuff} {
+ global control task
+
+ lappend control($task,alrules) [list allow $type $stuff]
+}
+
+proc debug {level} {
+ global debuglevel
+
+ set debuglevel $level
+}
+
+proc CreateTask {t} {
+ global tasks task status control
+
+ set task $t
+
+ if {[info exists tasks]} {
+ if {[lsearch -exact $tasks $t] >= 0} {
+ return 0
+ }
+ }
+
+ lappend tasks $t
+ set status($t,unvisited) 0
+ set status($t,visited) 0
+ set status($t,bad) 0
+ set status($t,raw) 0
+ set status($t,active) 1
+ set control($t,seq) 0
+ set control($t,distance) 10
+ set control($t,target) tmp
+ set control($t,output) output
+ set control($t,cr) 0
+ return 1
+}
+
+# Little utility that ensures that at least one task is present (main).
+proc CreateMainTask {} {
+ global tasks
+ if {![info exist tasks]} {
+ CreateTask main
+ }
+}
+
+# Parse options
+
+set i 0
+set l [llength $argv]
+
+if {$l < 1} {
+ puts {tclrobot: usage:}
+ puts {tclrobot [-j jobs] [-p pid] [-T tmpdir] [-o logfile] [-i idle] [-c
+ count] [-d domain] [-D spooldir] [-r rules] [-L libdir] [url ..]}
+ logmsg " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
+
+ exit 1
+}
+
+
+while {$i < $l} {
+ set arg [lindex $argv $i]
+ switch -glob -- $arg {
+ -o* {
+ set fname [string range $arg 2 end]
+ if {![string length $fname]} {
+ set fname [lindex $argv [incr i]]
+ }
+ set loghandle [open $fname a]
+ #dbgmsg "agent: $agent"
+ #dbgmsg "-o $fname"
+ }
+ -p* {
+ set pidfname [string range $arg 2 end]
+ if {![string length $pidfname]} {
+ set pidfname [lindex $argv [incr i]]
+ }
+ #dbgmsg "-p $pidfname"
+ if {[file exists $pidfname]} {
+ set pf [open $pidfname]
+ gets $pf oldpid
+ close $pf
+ logmsg "File $pidfname already exist. pid=$oldpid"
+ if {[file isdirectory /proc/$oldpid]} {
+ logmsg "And it's apparently running. Exiting."
+ exit 1
+ }
+ }
+ set pf [open $pidfname w]
+ puts $pf [pid]
+ close $pf
+ }
+ -T* {
+ set tmpdir [string range $arg 2 end]
+ if {![string length $tmpdir]} {
+ set tmpdir [lindex $argv [incr i]]
+ }
+ set control(tmpdir) $tmpdir
+ }
+ -L* {
+ set libdir [string range $arg 2 end]
+ if {![string length $libdir]} {
+ set libdir [lindex $argv [incr i]]
+ }
+ }
+ -t* {
+ set t [string range $arg 2 end]
+ if {![string length $t]} {
+ set t [lindex $argv [incr i]]
+ }
+ CreateTask $t
+ }
+ -D* {
+ set dir [string range $arg 2 end]
+ if {![string length $dir]} {
+ set dir [lindex $argv [incr i]]
+ }
+ lappend daemon_dir $dir
+ }
+ -j* {
+ set robotsMax [string range $arg 2 end]
+ if {![string length $robotsMax]} {
+ set robotsMax [lindex $argv [incr i]]
+ }
+ }
+ -c* {
+ CreateMainTask
+ set control($task,distance) [string range $arg 2 end]
+ if {![string length $control($task,distance)]} {
+ set control($task,distance) [lindex $argv [incr i]]
+ }
+ }
+ -d* {
+ CreateMainTask
+ set dom [string range $arg 2 end]
+ if {![string length $dom]} {
+ set dom [lindex $argv [incr i]]
+ }
+ lappend control($task,domains) $dom
+ }
+ -i* {
+ set idletime [string range $arg 2 end]
+ if {![string length $idletime]} {
+ set idletime [lindex $argv [incr i]]
+ }
+ }
+ -l* {
+ CreateMainTask
+ set acceptLanguage [string range $arg 2 end]
+ if {![string length $acceptLanguage]} {
+ set acceptLanguage [lindex $argv [incr i]]
+ }
+ }
+ -r* {
+ CreateMainTask
+ set rfile [string range $arg 2 end]
+ if {![string length $rfile]} {
+ set rfile [lindex $argv [incr i]]
+ }
+ catch {unset maxdistance}
+ source $rfile
+ if {[info exists maxdistance]} {
+ set control($task,distance) $maxdistance
+ }
+ }
+ default {
+ CreateMainTask
+ set href $arg
+ #dbgmsg "in default: arg= $arg !!!"
+ loadlib
+ if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+ if {![RobotFileExist $task visited $host $path]} {
+ set outf [RobotFileOpen $task unvisited $host $path]
+ RobotWriteRecord $outf href 0
+ RobotFileClose $outf
+ }
+ }
+ }
+ }
+ incr i
+}
+
+
+dbgmsg "Parsed args, now loading"
+loadlib
+
+if {![info exist robotsMax]} {
+ set robotsMax 5
+}
+
+if {[info exist daemon_dir]} {
+ logmsg "Daemon mode"
+ RobotDaemonLoop
+} else {
+ foreach t $tasks {
+ logmsg "task $t"
+ logmsg "max distance=$control($t,distance)"
+ if {[info exists control($t,domains)]} {
+ logmsg "domains=$control($t,domains)"
+ }
+ }
+ logmsg "max jobs=$robotsMax"
+
+ foreach t $tasks {
+ RobotStart $t
+ }
+
+ while {$robotsRunning} {
+ vwait robotsRunning
+ }
+
+ if {[info exists tasks]} {
+ foreach t $tasks {
+ set statusfile [open $t/status w]
+ puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+ close $statusfile
+ }
+ }
+}
+