+++ /dev/null
-#!/usr/bin/tclsh
-# $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $
-#
-set loghandle stdout
-set robotsRunning 0
-set workdir [pwd]
-set idletime 15000
-set acceptLanguage {}
-set debuglevel 1
-set libdir ""
-
-
-proc logmsg {msg} {
- global loghandle
-
- puts $loghandle $msg
- flush $loghandle
-}
-
-#proc dbgmsg {level msg} {
-# global debuglevel
-# if {[expr $debuglevel >= $level]} {
-# logmsg $msg
-# }
-#}
-proc dbgmsg {msg} {
- global debuglevel
- if {[expr $debuglevel >= 0]} {
- logmsg $msg
- }
-}
-# dbgmsg is always called with just one string!
-
-
-proc fnameEncode {fname} {
- regsub -all {&} $fname {%38} fname
- regsub -all {<} $fname {%3C} fname
- regsub -all {>} $fname {%3E} fname
- regsub -all {\?} $fname {%3F} fname
- regsub -all {\*} $fname {%2A} fname
- return $fname
-}
-
-proc fnameDecode {fname} {
- regsub -all {%38} $fname {&} fname
- regsub -all {%3C} $fname {<} fname
- regsub -all {%3E} $fname {>} fname
- regsub -all {%3F} $fname {?} fname
- regsub -all {%2A} $fname {*} fname
- return $fname
-}
-
-proc RobotFileNext1 {area lead} {
- # dbgmsg "RobotFileNext1 area=$area lead=$lead"
- if {[catch {set ns [glob ${area}/*]}]} {
- return {}
- }
- foreach n $ns {
- if {[file isfile $n]} {
- set off [string last / $n]
- # skip /
- incr off
- set end [string length $n]
- # skip _.tkl
- incr end -6
- return $lead/[string range $n $off $end]
- }
- }
- foreach n $ns {
- if {[file isdirectory $n]} {
- set off [string last / $n]
- # skip /
- incr off
- set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
- if {[string length $sb]} {
- return $sb
- }
- }
- }
- return {}
-}
-
-proc RobotWriteRecord {outf fromurl distance} {
- puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
- puts $outf "<zmbot>"
- puts $outf "<distance>"
- puts $outf $distance
- puts $outf "</distance>"
- puts $outf "<fromurl>"
- puts $outf $fromurl
- puts $outf "</fromurl>"
- puts $outf "</zmbot>"
-}
-
-proc RobotReadRecord {inf fromurlx distancex} {
- upvar $fromurlx fromurl
- upvar $distancex distance
- gets $inf
- gets $inf
- gets $inf
- set distance [string trim [gets $inf]]
- # dbgmsg "got distance = $distance"
- gets $inf
- gets $inf
- set fromurl [string trim [gets $inf]]
-}
-
-proc RobotFileNext {task area} {
- global control
- global idletime ns
- global status
-
- # dbgmsg "RobotFileNext seq=$control($task,seq)"
- if {$control($task,seq) < 0} {
- return {}
- }
- set target $control($task,target)
- if {$control($task,seq) == 0} {
- if {[catch {set ns($task) [glob $target/$area/*]}]} {
- puts "----------- DONE-------- target=$target"
- return done
- }
- }
- # dbgmsg "ns=$ns($task)"
- set off [string length $target/$area]
- incr off
- set n [lindex $ns($task) $control($task,seq)]
- # dbgmsg "n=$n"
- if {![string length $n]} {
- set control($task,seq) -1
- set statusfile [open $target/status w]
- puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
- close $statusfile
- return wait
- }
- incr control($task,seq)
- if {[file isfile $n/robots.txt_.tkl]} {
- # dbgmsg "ok returning http://[string range $n $off end]/robots.txt"
- return [fnameDecode http://[string range $n $off end]/robots.txt]
- } elseif {[file isdirectory $n]} {
- set sb [RobotFileNext1 $n http://[string range $n $off end]]
- if {[string length $sb]} {
- return [fnameDecode $sb]
- }
- }
- dbgmsg "no more work at end of RobotFileNext n=$n"
- dbgmsg "ns=$ns($task)"
- return {}
-}
-
-
-proc RobotFileExist {task area host path} {
- global debuglevel control
-
- if {$debuglevel > 3} {
- dbgmsg "RobotFileExist begin area=$area host=$host path=$path"
- }
- set target $control($task,target)
- return [file exists [fnameEncode $target/$area/$host${path}_.tkl]]
-}
-
-proc RobotFileUnlink {task area host path} {
- global status control
-
- set target $control($task,target)
- # dbgmsg "RobotFileUnlink begin"
- # dbgmsg "area=$area host=$host path=$path"
- set npath [fnameEncode $target/$area/$host${path}_.tkl]
- # dbgmsg "npath=$npath"
- set comp [split $npath /]
- if {[catch {exec rm $npath}]} return
-
- set l [llength $comp]
- incr l -2
- incr status($task,$area) -1
- for {set i $l} {$i > 0} {incr i -1} {
- set path [join [lrange $comp 0 $i] /]
- if {![catch {glob $path/*}]} return
- exec rmdir $path
- }
- # dbgmsg "RobotFileUnlink end"
-}
-
-proc RobotFileClose {out} {
- if [string compare $out stdout] {
- close $out
- }
-}
-
-proc RobotFileOpen {task area host path {mode w}} {
- set orgPwd [pwd]
- global workdir status debuglevel control
-
- # dbgmsg "RobotFileOpen task=$task path=$path"
-
- set target $control($task,target)
- set path [fnameEncode $path]
-
- if {![info exists workdir]} {
- return stdout
- }
- if {$debuglevel > 3} {
- dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
- }
- if {[string compare $orgPwd $workdir]} {
- dbgmsg "ooops. RobotFileOpen failed"
- dbgmsg "workdir = $workdir"
- dbgmsg "pwd = $orgPwd"
- exit 1
- }
-
- set comp [split $target/$area/$host /]
- set len [llength $comp]
- incr len -1
-
- # dbgmsg "1 comp=$comp"
-
- for {set i 0} {$i <= $len} {incr i} {
- set d [lindex $comp $i]
- if {[string length $d] == 0} {
- cd /
- } elseif {[catch {cd $d}]} {
- exec mkdir $d
- cd ./$d
- if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
- if {[string compare $path /robots.txt]} {
- set out [open robots.txt_.tkl w]
- dbgmsg "creating robots.txt in $d"
- close $out
- incr status($task,unvisited)
- }
- }
- }
- }
-
- set comp [split $path /]
- set len [llength $comp]
- incr len -1
-
- # dbgmsg "2 path=$path comp=$comp"
-
- for {set i 0} {$i < $len} {incr i} {
- set d [lindex $comp $i]
- if {[string length $d] > 0} {
- if {[catch {cd $d}]} {
- exec mkdir $d
- cd ./$d
- }
- }
- }
- set d [lindex $comp $len]
- set out [open ${d}_.tkl $mode]
- if {$mode == "w"} {
- incr status($task,$area)
- }
- cd $orgPwd
- return $out
-}
-
-proc RobotStartJob {root task} {
- global control
-
- set fname "$root$task"
- set f [open $fname r]
- set xml [read $f]
- dbgmsg "Reading $fname"
- close $f
- # task type must be 2
- if {![regexp {<tasktype>([^<]*)</tasktype>} $xml x tasktype]} {
- return
- }
- set tasktype [string trim $tasktype]
- if {![string match 2 $tasktype]} {
- return
- }
- # status must not be finished or error
- if {![regexp {<status>([^<]*)</status>} $xml x status]} {
- return
- }
- if {$status == "finished"} {
- dbgmsg "already finished"
- return
- }
- if {$status == "error"} {
- dbgmsg "already finished due to error"
- return
- }
- # ignore if task has already been processed
- dbgmsg "status = $status"
- if {![CreateTask $task]} {
- return
- }
- set control($task,taskfname) $fname
- dbgmsg "Reading $fname stage 2"
- htmlSwitch $xml \
- url {
- lappend starturls $body
- } filter {
- set type $parm(type)
- set action $parm(action)
- if {$type == "domain"} {
- $action url http://$body/*
- }
- if {$type == "url"} {
- $action url $body
- }
- if {$type == "mime"} {
- $action mime $body
- }
- } target {
- set ex [file rootname [file tail $task]]
- #set control($task,target) "$root$body/$ex"
- set control($task,target) "$control(tmpdir)/$ex"
- set control($task,output) "$root$body"
- } distance {
- set control($task,distance) $body
- } status {
- set control($task,filestatus) $body
- } tasktype {
- set control($task,tasktype) $body
- }
-
- if {[info exists starturls]} {
- foreach url $starturls {
- puts "marking start urls $url"
- url $url
- }
- }
-
- if {$status == "pending"} {
- regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
- set f [open $fname w]
- puts -nonewline $f $xml2
- close $f
- }
-}
-
-proc RobotDoneJob {task} {
- global daemon_dir control
-
- if {![info exists daemon_dir]} {
- return
- }
- set fname $control($task,taskfname)
- set f [open $fname r]
- set xml [read $f]
- dbgmsg "Reading $fname"
- regexp {<status>([^<]*)</status>} $xml x status
- dbgmsg "------"
- dbgmsg "status = $status"
- close $f
-
- regsub {<status>[^<]*</status>} $xml {<status>finished</status>} xml2
- set f [open $fname w]
- puts -nonewline $f $xml2
- close $f
-}
-
-proc RobotScanDir {} {
- global daemon_dir
-
- if {![info exists daemon_dir]} {
- return
- }
- foreach d $daemon_dir {
- if {[catch {set files [glob $d/*.spl]}]} {
- return
- }
- foreach fname $files {
- if {[file isfile $fname] && [file readable $fname]} {
- set jobfile [open $fname]
- gets $jobfile portalroot
- gets $jobfile portaltask
- close $jobfile
-
- RobotStartJob $portalroot $portaltask
- }
- }
- }
-}
-
-proc RobotRR {task} {
- global control robotsRunning tasks robotsMax status
-
- dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
- incr robotsRunning -1
-
- # only one task gets through...
- if {[string compare [lindex $tasks 0] $task]} {
- return
- }
- dbgmsg "RobotRR. task = $task"
- while {$robotsRunning} {
- vwait robotsRunning
- }
- dbgmsg "Scan"
- if {[catch {RobotScanDir} msg]} {
- logmsg "RobotScanDir failed"
- logmsg $msg
- }
- foreach t $tasks {
- set target $control($t,target)
- set statusfile [open $target/status w]
- puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
- close $statusfile
- set control($t,seq) 0
- RobotStart $t
- }
-}
-
-proc RobotDaemonSig {} {
- global daemon_cnt
-
- incr daemon_cnt
-}
-
-proc RobotDaemonLoop {} {
- global daemon_cnt tasks robotsRunning status
-
- set daemon_cnt 0
- while 1 {
- logmsg $daemon_cnt
- RobotScanDir
-
- if {[info exists tasks]} {
- logmsg "daemon loop tasks $tasks"
- foreach t $tasks {
- set control($t,seq) 0
- RobotStart $t
- }
- while {$robotsRunning} {
- vwait robotsRunning
- }
- }
- after 30000 RobotDaemonSig
- vwait daemon_cnt
- }
-}
-
-proc RobotRestart {task url sock} {
- global URL robotsRunning
-
- close $sock
- after cancel $URL($sock,cancel)
-
- foreach v [array names URL $task,$url,*] {
- unset URL($v)
- }
-
- incr robotsRunning -1
- RobotStart $task
-}
-
-proc RobotStart {task} {
- global URL
- global robotsRunning robotsMax idletime status tasks
-
- # dbgmsg "RobotStart $task running=$robotsRunning"
- while {1} {
- set url [RobotFileNext $task unvisited]
- if {[string compare $url done] == 0} {
- dbgmsg "In RobotStart task $task done"
-
- catch {unset ntasks}
- foreach t $tasks {
- if {[string compare $t $task]} {
- lappend ntasks $t
- } else {
- dbgmsg "task $t done"
- }
- }
- if {![info exists ntasks]} {
- unset tasks
- dbgmsg "all done"
- } else {
- set tasks $ntasks
- }
- RobotDoneJob $task
- return
- }
- if {![string length $url]} {
- return
- }
- incr robotsRunning
- if {[string compare $url wait] == 0} {
- after $idletime [list RobotRR $task]
- return
- }
- set r [RobotGetUrl $task $url {}]
- if {!$r} {
- if {$robotsRunning >= $robotsMax} return
- } else {
- incr robotsRunning -1
- if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
- set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
- RobotFileClose $outf
- }
- RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
- }
- }
-}
-
-proc headSave {task url out} {
- global URL
-
- if {[info exists URL($task,$url,head,last-modified)]} {
- puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
- }
- puts $out {<si>}
- if {[info exists URL($task,$url,head,date)]} {
- puts $out " <date>$URL($task,$url,head,date)</date>"
- }
- if {[info exists URL($task,$url,head,content-length)]} {
- puts $out " <by>$URL($task,$url,head,content-length)</by>"
- }
- if {[info exists URL($task,$url,head,server)]} {
- puts $out " <format>$URL($task,$url,head,server)</format>"
- }
- puts $out {</si>}
- puts $out {<publisher>}
- puts $out " <identifier>$url</identifier>"
- if {[info exists URL($task,$url,head,content-type)]} {
- puts $out " <type>$URL($task,$url,head,content-type)</type>"
- }
- puts $out {</publisher>}
-}
-
-proc RobotHref {task url hrefx hostx pathx} {
- global URL control debuglevel
- upvar $hrefx href
- upvar $hostx host
- upvar $pathx path
-
- if {$debuglevel > 1} {
- dbgmsg "Ref input url = $url href=$href"
- }
-
- if {[string first { } $href] >= 0} {
- return 0
- }
- if {[string length $href] > 256} {
- return 0
- }
-
-# Skip pages that have ? in them
-# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
-# return 0
-# }
- # get method (if any)
- if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
- set hpath $href
- set method http
- } else {
- if {[string compare $method http]} {
- return 0
- }
- }
- # get host (if any)
- if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
- if {![string length $surl]} {
- set surl /
- }
- if {[info exist control($task,domains)]} {
- set ok 0
- foreach domain $control($task,domains) {
- if {[string match $domain $host]} {
- set ok 1
- break
- }
- }
- if {!$ok} {
- return 0
- }
- }
- } else {
- regexp {^([^\#]*)} $hpath x surl
- set host $URL($task,$url,hostport)
- }
- if {![string length $surl]} {
- return 0
- }
- if {[string first / $surl]} {
- # relative path
- set curpath $URL($task,$url,path)
- if {[info exists URL($task,$url,bpath)]} {
- set curpath $URL($task,$url,bpath)
- }
- regexp {^([^\#?]*)} $curpath x dpart
- set l [string last / $dpart]
- if {[expr $l >= 0]} {
- set surl [string range $dpart 0 $l]$surl
- } else {
- set surl $dpart/$surl
- }
- }
- set surllist [split $surl /]
- catch {unset path}
- set pathl 0
- foreach c $surllist {
- switch -- $c {
- .. {
- if {$pathl > 1} {
- incr pathl -2
- set path [lrange $path 0 $pathl]
- incr pathl
- }
- }
- . {
-
- }
- default {
- incr pathl
- lappend path $c
- }
- }
- }
- if {$debuglevel > 4} {
- dbgmsg "pathl=$pathl output path=$path"
- }
- set path [join $path /]
- if {![string length $path]} {
- set path /
- }
- regsub -all {~} $path {%7E} path
- set href "$method://$host$path"
-
- if {$debuglevel > 1} {
- dbgmsg "Ref result = $href"
- }
- return [checkrule $task url $href]
-}
-
-proc RobotError {task url code} {
- global URL
-
- dbgmsg "Bad URL $url (code $code)"
- set fromurl {}
- set distance -1
- if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
- set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
- RobotReadRecord $inf fromurl distance
- RobotFileClose $inf
- }
- RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
- if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
- set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
- RobotWriteRecord $outf $fromurl $distance
- RobotFileClose $outf
- }
-}
-
-proc RobotRedirect {task url tourl code} {
- global URL
-
- dbgmsg "Redirecting from $url to $tourl"
-
- set distance {}
- set fromurl {}
- if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
- set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
- RobotReadRecord $inf fromurl distance
- RobotFileClose $inf
- }
- if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
- set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
- RobotWriteRecord $outf $fromurl $distance
- RobotFileClose $outf
- }
- if {[RobotHref $task $url tourl host path]} {
- if {![RobotFileExist $task visited $host $path]} {
- if {![RobotFileExist $task unvisited $host $path]} {
- set outf [RobotFileOpen $task unvisited $host $path]
- RobotWriteRecord $outf $fromurl $distance
- RobotFileClose $outf
- }
- } else {
- set olddistance {}
- set inf [RobotFileOpen $task visited $host $path r]
- RobotReadRecord $inf oldurl olddistance
- RobotFileClose $inf
- if {[string length $olddistance] == 0} {
- set olddistance 1000
- }
- if {[string length $distance] == 0} {
- set distance 1000
- }
- dbgmsg "distance=$distance olddistance=$olddistance"
- if {[expr $distance < $olddistance]} {
- set outf [RobotFileOpen $task unvisited $host $path]
- RobotWriteRecord $outf $tourl $distance
- RobotFileClose $outf
- }
- }
- }
- if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
- dbgmsg "unlink failed"
- exit 1
- }
-}
-
-proc wellform {body} {
- regsub -all {<!--[^-]*-->} $body { } abody
- regsub -all -nocase {<script[^<]*</script>} $abody {} body
- regsub -all {<[^\>]+>} $body {} abody
- regsub -all { } $abody { } body
- regsub -all {&} $body {&} abody
- return $abody
-}
-
-proc link {task url out href body distance} {
- global URL control
- if {[expr $distance > $control($task,distance)]} return
-
- if {![RobotHref $task $url href host path]} return
-
- if ($control($task,cr)) {
- puts $out "<cr>"
- puts $out "<identifier>$href</identifier>"
- set abody [wellform $body]
- puts $out "<description>$abody</description>"
- puts $out "</cr>"
- }
-
- if {![RobotFileExist $task visited $host $path]} {
- set olddistance 1000
- if {![RobotFileExist $task bad $host $path]} {
- if {[RobotFileExist $task unvisited $host $path]} {
- set inf [RobotFileOpen $task unvisited $host $path r]
- RobotReadRecord $inf oldurl olddistance
- RobotFileClose $inf
- }
- } else {
- set olddistance 0
- }
- if {[string length $olddistance] == 0} {
- set olddistance 1000
- }
- if {[expr $distance < $olddistance]} {
- set outf [RobotFileOpen $task unvisited $host $path]
- RobotWriteRecord $outf $url $distance
- RobotFileClose $outf
- }
- } elseif {[string compare $href $url]} {
- set inf [RobotFileOpen $task visited $host $path r]
- RobotReadRecord $inf xurl olddistance
- close $inf
- if {[string length $olddistance] == 0} {
- set olddistance 1000
- }
- if {[expr $distance < $olddistance]} {
- dbgmsg "OK remarking url=$url href=$href"
- dbgmsg "olddistance = $olddistance"
- dbgmsg "newdistance = $distance"
- set outf [RobotFileOpen $task unvisited $host $path]
- RobotWriteRecord $outf $url $distance
- RobotFileClose $outf
- }
- }
-}
-
-proc RobotTextTkl {task url out} {
- global URL control
-
- # set title so we can emit it for the body
- set title {}
- # if true, nothing will be indexed
- set noindex 0
- # if true, nothing will be followed
- set nofollow 0
-
- puts $control($task,output)
-
- set out stdout
- set distance distance
-
- htmlSwitch $URL($task,$url,buf) \
- title {
- # når title tag er hittet, er body set til indholdet af tagget
- set title $body
- } -nonest meta {
- #puts -nonewline $out "<meta"
- # al er list med attribut navne som fandtes ind i parm hash
- set al [array names parm]
- # løkke igennem attributer
- foreach a $al {
- set al [string tolower $a]
- #puts -nonewline $out " $al"
- #puts -nonewline $out {="}
- #puts -nonewline $out $parm($a)
- #puts -nonewline $out {"}
- unset parm($al)
- }
- #puts $out "></meta>"
- } body {
- # don't print title of document content if noindex is used
- if {!$noindex} {
- #puts $out "<title>$title</title>"
- # xml compilancy added
- set bbody [wellform $body]
- #puts $out "<documentcontent>"
- #puts $out $bbody
- #puts $out "</documentcontent>"
- }
- } -nonest base {
- # <base href=.. >
- if {![info exists parm(href)]} {
- continue
- }
- set href [string trim $parm(href)]
- } a {
- # <a href="...."> .. </a>
- # we're not using nonest - otherwise body isn't set
- if {$nofollow} continue
- if {![info exists parm(href)]} {
- continue
- }
- #puts "link $task $url $out [string trim $parm(href)] $body $distance"
- } -nonest area {
- if {$nofollow} continue
- if {![info exists parm(href)]} {
- continue
- }
- #puts "link $task $url $out [string trim $parm(href)] $body $distance"
- } -nonest frame {
- if {![info exists parm(src)]} {
- continue
- }
- #puts "link $task $url $out [string trim $parm(src)] $body $fdistance"
- }
-}
-
-proc RobotTextHtml {task url out} {
- global URL control
-
- # set title so we can emit it for the body
- set title {}
- # if true, nothing will be indexed
- set noindex 0
- # if true, nothing will be followed
- set nofollow 0
-
- set distance 0
- set fdistance 0
- if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
- set fdistance $URL($task,$url,dist)
- set distance [expr $fdistance + 1]
- }
- htmlSwitch $URL($task,$url,buf) \
- title {
- set title $body
- } -nonest meta {
- # collect metadata and save NAME= CONTENT=..
- set metaname {}
- set metacontent {}
- puts -nonewline $out "<meta"
- set al [array names parm]
- foreach a $al {
- set al [string tolower $a]
- puts -nonewline $out " $al"
- puts -nonewline $out {="}
- puts -nonewline $out $parm($a)
- puts -nonewline $out {"}
- switch -- $al {
- "name" {
- set metaname [string tolower $parm($a)]
- }
- "content" {
- set metacontent $parm($a)
- }
- }
- unset parm($al)
- }
- puts $out "></meta>"
- # go through robots directives (af any)
- if {![string compare $metaname robots]} {
- set direcs [split [string tolower $metacontent] ,]
- if {[lsearch $direcs noindex] >= 0} {
- set noindex 1
- }
- if {[lsearch $direcs nofollow] >= 0} {
- set nofollow 1
- }
- }
- } body {
- # don't print title of document content if noindex is used
- if {!$noindex} {
- puts $out "<title>$title</title>"
- set bbody [wellform $body]
- puts $out "<documentcontent>"
- puts $out $bbody
- puts $out "</documentcontent>"
- }
- } -nonest base {
- # <base href=.. >
- if {![info exists parm(href)]} {
- continue
- }
- set href [string trim $parm(href)]
- if {![RobotHref $task $url href host path]} continue
- set URL($task,$url,bpath) $path
- } a {
- # <a href="...."> .. </a>
- # we're not using nonest - otherwise body isn't set
- if {$nofollow} continue
- if {![info exists parm(href)]} {
- continue
- }
- link $task $url $out [string trim $parm(href)] $body $distance
- } -nonest area {
- if {$nofollow} continue
- if {![info exists parm(href)]} {
- continue
- }
- link $task $url $out [string trim $parm(href)] $body $distance
- } -nonest frame {
- if {![info exists parm(src)]} {
- continue
- }
- link $task $url $out [string trim $parm(src)] $body $fdistance
- }
-}
-
-proc RobotsTxt {task url} {
- global agent URL
-
- RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
-}
-
-proc RobotsTxt0 {task v buf} {
- global URL agent
- set section 0
- foreach l [split $buf \n] {
- if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
- set arg [string trim $arg]
- dbgmsg "cmd=$cmd arg=$arg"
- switch -- [string tolower $cmd] {
- user-agent {
- if {$section} break
- set pat [string tolower $arg]*
- set section [string match $pat $agent]
- }
- disallow {
- if {$section} {
- dbgmsg "rule [list 0 $arg]"
- lappend $v [list 0 $arg]
- }
- }
- allow {
- if {$section} {
- dbgmsg "rule [list 1 $arg]"
- lappend $v [list 1 $arg]
- }
- }
- }
- }
- }
-}
-
-proc RobotTextPlain {task url out} {
- global URL
-
- puts $out "<documentcontent>"
- regsub -all {<} $URL($task,$url,buf) {\<} content
- puts $out $content
- puts $out "</documentcontent>"
-
- if {![string compare $URL($task,$url,path) /robots.txt]} {
- RobotsTxt $task $url
- }
-}
-
-proc RobotWriteMetadata {task url out} {
- global URL
-
- set charset $URL($task,$url,charset)
- puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
- puts $out "<zmbot>"
-
- set distance 1000
- if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
- set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
- RobotReadRecord $inf fromurl distance
- RobotFileClose $inf
- }
- set URL($task,$url,dist) $distance
- puts $out "<distance>"
- puts $out " $distance"
- puts $out "</distance>"
- headSave $task $url $out
- logmsg "Parsing $url distance=$distance"
- switch $URL($task,$url,head,content-type) {
- text/html {
- if {[string length $distance]} {
- RobotTextHtml $task $url $out
- RobotTextTkl $task $url $out
- }
- }
- text/plain {
- RobotTextPlain $task $url $out
- }
- }
- puts $out "</zmbot>"
-}
-
-proc Robot200 {task url} {
- global URL
-
- set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
- puts -nonewline $out $URL($task,$url,buf)
- RobotFileClose $out
-
- set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
- RobotWriteMetadata $task $url $out
- RobotFileClose $out
-
- RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
-}
-
-proc RobotReadContent {task url sock binary} {
- global URL
-
- set buffer [read $sock 16384]
- set readCount [string length $buffer]
-
- if {$readCount <= 0} {
- Robot200 $task $url
- RobotRestart $task $url $sock
- } elseif {!$binary && [string first \0 $buffer] >= 0} {
- Robot200 $task $url
- RobotRestart $task $url $sock
- } else {
- # dbgmsg "Got $readCount bytes"
- set URL($task,$url,buf) $URL($task,$url,buf)$buffer
- }
-}
-
-proc RobotReadHeader {task url sock} {
- global URL debuglevel
-
- if {$debuglevel > 1} {
- dbgmsg "HTTP head $url"
- }
- if {[catch {set buffer [read $sock 2148]}]} {
- RobotError $task $url 404
- RobotRestart $task $url $sock
- return
- }
- set readCount [string length $buffer]
-
- if {$readCount <= 0} {
- RobotError $task $url 404
- RobotRestart $task $url $sock
- } else {
- # dbgmsg "Got $readCount bytes"
- set URL($task,$url,buf) $URL($task,$url,buf)$buffer
-
- set n [string first \r\n\r\n $URL($task,$url,buf)]
- if {$n > 1} {
- set code 0
- set version {}
- set headbuf [string range $URL($task,$url,buf) 0 $n]
- incr n 4
- set URL($task,$url,charset) ISO-8859-1
- set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
-
- regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
- set lines [split $headbuf \n]
- foreach line $lines {
- if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
- set URL($task,$url,head,[string tolower $name]) [string trim $value]
- }
- regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
- }
- dbgmsg "HTTP CODE $code"
- set URL($task,$url,state) skip
- switch $code {
- 301 {
- RobotRedirect $task $url $URL($task,$url,head,location) 301
- RobotRestart $task $url $sock
- }
- 302 {
- RobotRedirect $task $url $URL($task,$url,head,location) 302
- RobotRestart $task $url $sock
- }
- 200 {
- if {![info exists URL($task,$url,head,content-type)]} {
- set URL($task,$url,head,content-type) {}
- }
- set binary 1
- switch -glob -- $URL($task,$url,head,content-type) {
- text/* {
- set binary 0
- }
- }
- if {![regexp {/robots.txt$} $url]} {
- if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
- RobotError $task $url mimedeny
- RobotRestart $task $url $sock
- return
- }
- }
- fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
- }
- default {
- RobotError $task $url $code
- RobotRestart $task $url $sock
- }
- }
- }
- }
-}
-
-proc RobotSockCancel {task url sock} {
-
- logmsg "RobotSockCancel sock=$sock url=$url"
- RobotError $task $url 401
- RobotRestart $task $url $sock
-}
-
-proc RobotConnect {task url sock} {
- global URL agent acceptLanguage
-
- fconfigure $sock -translation {lf crlf} -blocking 0
- fileevent $sock readable [list RobotReadHeader $task $url $sock]
- puts $sock "GET $URL($task,$url,path) HTTP/1.0"
- puts $sock "Host: $URL($task,$url,host)"
- puts $sock "User-Agent: $agent"
- if {[string length $acceptLanguage]} {
- puts $sock "Accept-Language: $acceptLanguage"
- }
- puts $sock ""
- set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
- if {[catch {flush $sock}]} {
- RobotError $task $url 404
- RobotRestart $task $url $sock
- }
-}
-
-proc RobotNop {} {
-
-}
-
-proc RobotGetUrl {task url phost} {
- global URL robotsRunning
- flush stdout
- dbgmsg "Retrieve running=$robotsRunning url=$url task=$task"
- if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
- return -1
- }
- if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
- set port 80
- set host $hostport
- }
- set URL($task,$url,method) $method
- set URL($task,$url,host) $host
- set URL($task,$url,hostport) $hostport
- set URL($task,$url,path) $path
- set URL($task,$url,state) head
- set URL($task,$url,buf) {}
-
- if {[string compare $path /robots.txt]} {
- set ok 1
- if {![info exists URL($hostport,robots)]} {
- dbgmsg "READING robots.txt for host $hostport"
- if {[RobotFileExist $task visited $hostport /robots.txt]} {
- set inf [RobotFileOpen $task visited $hostport /robots.txt r]
- set buf [read $inf 32768]
- close $inf
- } else {
- set buf "User-agent: *\nAllow: /\n"
- }
- RobotsTxt0 $task URL($hostport,robots) $buf
- }
- if {[info exists URL($hostport,robots)]} {
- foreach l $URL($hostport,robots) {
- if {[string first [lindex $l 1] $path] == 0} {
- set ok [lindex $l 0]
- break
- }
- }
- }
- if {!$ok} {
- dbgmsg "skipped due to robots.txt"
- return -1
- }
- }
- if [catch {set sock [socket -async $host $port]}] {
- return -1
- }
- RobotConnect $task $url $sock
-
- return 0
-}
-
-proc loadlib {} {
- global libdir
-
- if {![llength [info commands htmlSwitch]]} {
- if {[info exists env(tclrobot_lib)]} {
- set d $env(tclrobot_lib)
- } else {
- if { $libdir > "" } {
- set d $libdir
- } else {
- set d .
- }
- }
- set e [info sharedlibextension]
- dbgmsg "About to load $d/tclrobot$e"
- if {[catch {load $d/tclrobot$e}]} {
- dbgmsg "Didn't get at $d, trying directly"
- load tclrobot$e
- }
- dbgmsg "Loaded tclrobot$e all right"
- }
-}
-
-set agent "zmbot/0.2"
-if {![catch {set os [exec uname -s -r]}]} {
- set agent "$agent ($os)"
-}
-
-dbgmsg "agent: $agent"
-
-proc bgerror {m} {
- global errorInfo
- dbgmsg "BGERROR $m"
- dbgmsg $errorInfo
-}
-
-# Rules: allow, deny, url
-
-proc checkrule {task type this} {
- global control
- global debuglevel
-
- set default_ret 1
-
- if {$debuglevel > 3} {
- dbgmsg "CHECKRULE $type $this"
- }
- if {[info exist control($task,alrules)]} {
- foreach l $control($task,alrules) {
- if {$debuglevel > 3} {
- dbgmsg "consider $l"
- }
- # consider type
- if {[lindex $l 1] != $type} continue
- # consider mask (! negates)
- set masks [lindex $l 2]
- set ok 0
- set default_ret 0
- foreach mask $masks {
- if {$debuglevel > 4} {
- dbgmsg "consider single mask $mask"
- }
- if {[string index $mask 0] == "!"} {
- set mask [string range $mask 1 end]
- if {[string match $mask $this]} continue
- } else {
- if {![string match $mask $this]} continue
- }
- set ok 1
- }
- if {$debuglevel > 4} {
- dbgmsg "ok = $ok"
- }
- if {!$ok} continue
- # OK, we have a match
- if {[lindex $l 0] == "allow"} {
- if {$debuglevel > 3} {
- dbgmsg "CHECKRULE MATCH OK"
- }
- return 1
- } else {
- if {$debuglevel > 3} {
- dbgmsg "CHECKFULE MATCH FAIL"
- }
- return 0
- }
- }
- }
- if {$debuglevel > 3} {
- dbgmsg "CHECKRULE MATCH DEFAULT $default_ret"
- }
- return $default_ret
-}
-
-
-proc url {href} {
- global debuglevel task
-
- if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
- if {![RobotFileExist $task visited $host $path]} {
- set outf [RobotFileOpen $task unvisited $host $path]
- RobotWriteRecord $outf href 0
- RobotFileClose $outf
- }
- }
-}
-
-proc deny {type stuff} {
- global control task
-
- lappend control($task,alrules) [list deny $type $stuff]
-}
-
-proc allow {type stuff} {
- global control task
-
- lappend control($task,alrules) [list allow $type $stuff]
-}
-
-proc debug {level} {
- global debuglevel
-
- set debuglevel $level
-}
-
-proc CreateTask {t} {
- global tasks task status control
-
- set task $t
-
- if {[info exists tasks]} {
- if {[lsearch -exact $tasks $t] >= 0} {
- return 0
- }
- }
-
- lappend tasks $t
- set status($t,unvisited) 0
- set status($t,visited) 0
- set status($t,bad) 0
- set status($t,raw) 0
- set status($t,active) 1
- set control($t,seq) 0
- set control($t,distance) 10
- set control($t,target) tmp
- set control($t,output) output
- set control($t,cr) 0
- return 1
-}
-
-# Little utility that ensures that at least one task is present (main).
-proc CreateMainTask {} {
- global tasks
- if {![info exist tasks]} {
- CreateTask main
- }
-}
-
-# Parse options
-
-set i 0
-set l [llength $argv]
-
-if {$l < 1} {
- puts {tclrobot: usage:}
- puts {tclrobot [-j jobs] [-p pid] [-T tmpdir] [-o logfile] [-i idle] [-c
- count] [-d domain] [-D spooldir] [-r rules] [-L libdir] [url ..]}
- logmsg " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
-
- exit 1
-}
-
-
-while {$i < $l} {
- set arg [lindex $argv $i]
- switch -glob -- $arg {
- -o* {
- set fname [string range $arg 2 end]
- if {![string length $fname]} {
- set fname [lindex $argv [incr i]]
- }
- set loghandle [open $fname a]
- #dbgmsg "agent: $agent"
- #dbgmsg "-o $fname"
- }
- -p* {
- set pidfname [string range $arg 2 end]
- if {![string length $pidfname]} {
- set pidfname [lindex $argv [incr i]]
- }
- #dbgmsg "-p $pidfname"
- if {[file exists $pidfname]} {
- set pf [open $pidfname]
- gets $pf oldpid
- close $pf
- logmsg "File $pidfname already exist. pid=$oldpid"
- if {[file isdirectory /proc/$oldpid]} {
- logmsg "And it's apparently running. Exiting."
- exit 1
- }
- }
- set pf [open $pidfname w]
- puts $pf [pid]
- close $pf
- }
- -T* {
- set tmpdir [string range $arg 2 end]
- if {![string length $tmpdir]} {
- set tmpdir [lindex $argv [incr i]]
- }
- set control(tmpdir) $tmpdir
- }
- -L* {
- set libdir [string range $arg 2 end]
- if {![string length $libdir]} {
- set libdir [lindex $argv [incr i]]
- }
- }
- -t* {
- set t [string range $arg 2 end]
- if {![string length $t]} {
- set t [lindex $argv [incr i]]
- }
- CreateTask $t
- }
- -D* {
- set dir [string range $arg 2 end]
- if {![string length $dir]} {
- set dir [lindex $argv [incr i]]
- }
- lappend daemon_dir $dir
- }
- -j* {
- set robotsMax [string range $arg 2 end]
- if {![string length $robotsMax]} {
- set robotsMax [lindex $argv [incr i]]
- }
- }
- -c* {
- CreateMainTask
- set control($task,distance) [string range $arg 2 end]
- if {![string length $control($task,distance)]} {
- set control($task,distance) [lindex $argv [incr i]]
- }
- }
- -d* {
- CreateMainTask
- set dom [string range $arg 2 end]
- if {![string length $dom]} {
- set dom [lindex $argv [incr i]]
- }
- lappend control($task,domains) $dom
- }
- -i* {
- set idletime [string range $arg 2 end]
- if {![string length $idletime]} {
- set idletime [lindex $argv [incr i]]
- }
- }
- -l* {
- CreateMainTask
- set acceptLanguage [string range $arg 2 end]
- if {![string length $acceptLanguage]} {
- set acceptLanguage [lindex $argv [incr i]]
- }
- }
- -r* {
- CreateMainTask
- set rfile [string range $arg 2 end]
- if {![string length $rfile]} {
- set rfile [lindex $argv [incr i]]
- }
- catch {unset maxdistance}
- source $rfile
- if {[info exists maxdistance]} {
- set control($task,distance) $maxdistance
- }
- }
- default {
- CreateMainTask
- set href $arg
- #dbgmsg "in default: arg= $arg !!!"
- loadlib
- if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
- if {![RobotFileExist $task visited $host $path]} {
- set outf [RobotFileOpen $task unvisited $host $path]
- RobotWriteRecord $outf href 0
- RobotFileClose $outf
- }
- }
- }
- }
- incr i
-}
-
-
-dbgmsg "Parsed args, now loading"
-loadlib
-
-if {![info exist robotsMax]} {
- set robotsMax 5
-}
-
-if {[info exist daemon_dir]} {
- logmsg "Daemon mode"
- RobotDaemonLoop
-} else {
- foreach t $tasks {
- logmsg "task $t"
- logmsg "max distance=$control($t,distance)"
- if {[info exists control($t,domains)]} {
- logmsg "domains=$control($t,domains)"
- }
- }
- logmsg "max jobs=$robotsMax"
-
- foreach t $tasks {
- RobotStart $t
- }
-
- while {$robotsRunning} {
- vwait robotsRunning
- }
-
- if {[info exists tasks]} {
- foreach t $tasks {
- set statusfile [open $t/status w]
- puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
- close $statusfile
- }
- }
-}
-