From: Marc Cromme Date: Thu, 14 Aug 2003 10:18:24 +0000 (+0000) Subject: no specific tkl stuff present any more X-Git-Tag: tclrobot.0.2.0~6 X-Git-Url: http://jsfdemo.indexdata.com/cgi-bin?a=commitdiff_plain;h=98fda5ea3294dda572881ad9bc26bf586fd9b30a;p=tclrobot.git no specific tkl stuff present any more --- diff --git a/debian/tkl-web-harvester.init b/debian/tkl-web-harvester.init deleted file mode 100755 index d25699d..0000000 --- a/debian/tkl-web-harvester.init +++ /dev/null @@ -1,86 +0,0 @@ -#! /bin/sh -# $Id: tkl-web-harvester.init,v 1.1 2003/08/14 08:17:05 marc Exp $ -# Start and stop tkl's tclrobot web harvester -# change runlevels using update-rc.d - -# What robot to start -BINDIR=/home/heikki/index/tclharv/tklite-utils/tcl/robot -ROBOT=$BINDIR/robot.tcl -# Where to find tkl default values -DEFAULTS="./tkl.default" ## "/etc/default/tkl" when in production!!! - -DISPLAYNAME="web harvester" -ROBOTNAME="tcl-webrobot" # file name base for logs etc - -test -x $ROBOT || exit 0 -test -f $DEFAULTS || exit 0 - -source $DEFAULTS - -# Now sourced from /etc/default/tkl -#TKL_USER="www-data" -#TKL_GROUP=www-data -#TKL_CONF_FILE=/etc/tkl.conf -#TKL_SPOOL_DIR=/var/spool/tkl -#TKL_LOG_DIR=/var/log/tkl -#TKL_PID_DIR=/var/pid/tkl -#TKL_TMP_DIR=/var/tmp/tkl -#TKL_PORTAL_DIRS - -# Specific paths -LOGFILE=$TKL_LOG_DIR/$ROBOTNAME.log -PIDFILE=$TKL_PID_DIR/$ROBOTNAME.PID -ROBOTOPTIONS=" \ - -o $LOGFILE \ - -p $PIDFILE \ - -T $TKL_TMP_DIR \ - -L $BINDIR \ - -D $TKL_SPOOL_DIR" - -# perform usual init.d daemon services -case "$1" in - start) - echo "Starting $DISPLAYNAME: " - if [ "$TKL_DEBUG" ] - then - echo Starting $ROBOT - echo with args $ROBOTOPTIONS - fi - cd $BINDIR # tcl needs to load a .so or two, from the same place... - start-stop-daemon --start \ - --pidfile $PIDFILE \ - --chuid $TKL_USER:$TKL_GROUP \ - --background \ - --exec $ROBOT -- $ROBOTOPTIONS - # disabling --quiet and --background makes debugging *much* easier! - # but they are nice to have in production - sleep 1 - if [ -f $PIDFILE ] - then - echo OK `cat $PIDFILE` - else - echo "Error - did not start" - fi - ;; - - stop) - echo "Stopping $DISPLAYNAME: " - start-stop-daemon --stop \ - --pidfile $PIDFILE \ - # --quiet - rm -f $PIDFILE - # -f more to keep silent if it isn't there! - ;; - - restart|reload|force-reload) - $0 stop - sleep 1 - $0 start - ;; - - *) - echo "Usage: /etc/init.d/tkl {start|stop|restart}" - exit 1 -esac - -exit 0 diff --git a/tkl-web-harvester b/tkl-web-harvester deleted file mode 100755 index 040da57..0000000 --- a/tkl-web-harvester +++ /dev/null @@ -1,1527 +0,0 @@ -#!/usr/bin/tclsh -# $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $ -# -set loghandle stdout -set robotsRunning 0 -set workdir [pwd] -set idletime 15000 -set acceptLanguage {} -set debuglevel 1 -set libdir "" - - -proc logmsg {msg} { - global loghandle - - puts $loghandle $msg - flush $loghandle -} - -#proc dbgmsg {level msg} { -# global debuglevel -# if {[expr $debuglevel >= $level]} { -# logmsg $msg -# } -#} -proc dbgmsg {msg} { - global debuglevel - if {[expr $debuglevel >= 0]} { - logmsg $msg - } -} -# dbgmsg is always called with just one string! - - -proc fnameEncode {fname} { - regsub -all {&} $fname {%38} fname - regsub -all {<} $fname {%3C} fname - regsub -all {>} $fname {%3E} fname - regsub -all {\?} $fname {%3F} fname - regsub -all {\*} $fname {%2A} fname - return $fname -} - -proc fnameDecode {fname} { - regsub -all {%38} $fname {&} fname - regsub -all {%3C} $fname {<} fname - regsub -all {%3E} $fname {>} fname - regsub -all {%3F} $fname {?} fname - regsub -all {%2A} $fname {*} fname - return $fname -} - -proc RobotFileNext1 {area lead} { - # dbgmsg "RobotFileNext1 area=$area lead=$lead" - if {[catch {set ns [glob ${area}/*]}]} { - return {} - } - foreach n $ns { - if {[file isfile $n]} { - set off [string last / $n] - # skip / - incr off - set end [string length $n] - # skip _.tkl - incr end -6 - return $lead/[string range $n $off $end] - } - } - foreach n $ns { - if {[file isdirectory $n]} { - set off [string last / $n] - # skip / - incr off - set sb [RobotFileNext1 $n $lead/[string range $n $off end]] - if {[string length $sb]} { - return $sb - } - } - } - return {} -} - -proc RobotWriteRecord {outf fromurl distance} { - puts $outf {} - puts $outf "" - puts $outf "" - puts $outf $distance - puts $outf "" - puts $outf "" - puts $outf $fromurl - puts $outf "" - puts $outf "" -} - -proc RobotReadRecord {inf fromurlx distancex} { - upvar $fromurlx fromurl - upvar $distancex distance - gets $inf - gets $inf - gets $inf - set distance [string trim [gets $inf]] - # dbgmsg "got distance = $distance" - gets $inf - gets $inf - set fromurl [string trim [gets $inf]] -} - -proc RobotFileNext {task area} { - global control - global idletime ns - global status - - # dbgmsg "RobotFileNext seq=$control($task,seq)" - if {$control($task,seq) < 0} { - return {} - } - set target $control($task,target) - if {$control($task,seq) == 0} { - if {[catch {set ns($task) [glob $target/$area/*]}]} { - puts "----------- DONE-------- target=$target" - return done - } - } - # dbgmsg "ns=$ns($task)" - set off [string length $target/$area] - incr off - set n [lindex $ns($task) $control($task,seq)] - # dbgmsg "n=$n" - if {![string length $n]} { - set control($task,seq) -1 - set statusfile [open $target/status w] - puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)" - close $statusfile - return wait - } - incr control($task,seq) - if {[file isfile $n/robots.txt_.tkl]} { - # dbgmsg "ok returning http://[string range $n $off end]/robots.txt" - return [fnameDecode http://[string range $n $off end]/robots.txt] - } elseif {[file isdirectory $n]} { - set sb [RobotFileNext1 $n http://[string range $n $off end]] - if {[string length $sb]} { - return [fnameDecode $sb] - } - } - dbgmsg "no more work at end of RobotFileNext n=$n" - dbgmsg "ns=$ns($task)" - return {} -} - - -proc RobotFileExist {task area host path} { - global debuglevel control - - if {$debuglevel > 3} { - dbgmsg "RobotFileExist begin area=$area host=$host path=$path" - } - set target $control($task,target) - return [file exists [fnameEncode $target/$area/$host${path}_.tkl]] -} - -proc RobotFileUnlink {task area host path} { - global status control - - set target $control($task,target) - # dbgmsg "RobotFileUnlink begin" - # dbgmsg "area=$area host=$host path=$path" - set npath [fnameEncode $target/$area/$host${path}_.tkl] - # dbgmsg "npath=$npath" - set comp [split $npath /] - if {[catch {exec rm $npath}]} return - - set l [llength $comp] - incr l -2 - incr status($task,$area) -1 - for {set i $l} {$i > 0} {incr i -1} { - set path [join [lrange $comp 0 $i] /] - if {![catch {glob $path/*}]} return - exec rmdir $path - } - # dbgmsg "RobotFileUnlink end" -} - -proc RobotFileClose {out} { - if [string compare $out stdout] { - close $out - } -} - -proc RobotFileOpen {task area host path {mode w}} { - set orgPwd [pwd] - global workdir status debuglevel control - - # dbgmsg "RobotFileOpen task=$task path=$path" - - set target $control($task,target) - set path [fnameEncode $path] - - if {![info exists workdir]} { - return stdout - } - if {$debuglevel > 3} { - dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode" - } - if {[string compare $orgPwd $workdir]} { - dbgmsg "ooops. RobotFileOpen failed" - dbgmsg "workdir = $workdir" - dbgmsg "pwd = $orgPwd" - exit 1 - } - - set comp [split $target/$area/$host /] - set len [llength $comp] - incr len -1 - - # dbgmsg "1 comp=$comp" - - for {set i 0} {$i <= $len} {incr i} { - set d [lindex $comp $i] - if {[string length $d] == 0} { - cd / - } elseif {[catch {cd $d}]} { - exec mkdir $d - cd ./$d - if {![string compare $area unvisited] && $i == $len && $mode == "w"} { - if {[string compare $path /robots.txt]} { - set out [open robots.txt_.tkl w] - dbgmsg "creating robots.txt in $d" - close $out - incr status($task,unvisited) - } - } - } - } - - set comp [split $path /] - set len [llength $comp] - incr len -1 - - # dbgmsg "2 path=$path comp=$comp" - - for {set i 0} {$i < $len} {incr i} { - set d [lindex $comp $i] - if {[string length $d] > 0} { - if {[catch {cd $d}]} { - exec mkdir $d - cd ./$d - } - } - } - set d [lindex $comp $len] - set out [open ${d}_.tkl $mode] - if {$mode == "w"} { - incr status($task,$area) - } - cd $orgPwd - return $out -} - -proc RobotStartJob {root task} { - global control - - set fname "$root$task" - set f [open $fname r] - set xml [read $f] - dbgmsg "Reading $fname" - close $f - # task type must be 2 - if {![regexp {([^<]*)} $xml x tasktype]} { - return - } - set tasktype [string trim $tasktype] - if {![string match 2 $tasktype]} { - return - } - # status must not be finished or error - if {![regexp {([^<]*)} $xml x status]} { - return - } - if {$status == "finished"} { - dbgmsg "already finished" - return - } - if {$status == "error"} { - dbgmsg "already finished due to error" - return - } - # ignore if task has already been processed - dbgmsg "status = $status" - if {![CreateTask $task]} { - return - } - set control($task,taskfname) $fname - dbgmsg "Reading $fname stage 2" - htmlSwitch $xml \ - url { - lappend starturls $body - } filter { - set type $parm(type) - set action $parm(action) - if {$type == "domain"} { - $action url http://$body/* - } - if {$type == "url"} { - $action url $body - } - if {$type == "mime"} { - $action mime $body - } - } target { - set ex [file rootname [file tail $task]] - #set control($task,target) "$root$body/$ex" - set control($task,target) "$control(tmpdir)/$ex" - set control($task,output) "$root$body" - } distance { - set control($task,distance) $body - } status { - set control($task,filestatus) $body - } tasktype { - set control($task,tasktype) $body - } - - if {[info exists starturls]} { - foreach url $starturls { - puts "marking start urls $url" - url $url - } - } - - if {$status == "pending"} { - regsub {[^<]*} $xml {running} xml2 - set f [open $fname w] - puts -nonewline $f $xml2 - close $f - } -} - -proc RobotDoneJob {task} { - global daemon_dir control - - if {![info exists daemon_dir]} { - return - } - set fname $control($task,taskfname) - set f [open $fname r] - set xml [read $f] - dbgmsg "Reading $fname" - regexp {([^<]*)} $xml x status - dbgmsg "------" - dbgmsg "status = $status" - close $f - - regsub {[^<]*} $xml {finished} xml2 - set f [open $fname w] - puts -nonewline $f $xml2 - close $f -} - -proc RobotScanDir {} { - global daemon_dir - - if {![info exists daemon_dir]} { - return - } - foreach d $daemon_dir { - if {[catch {set files [glob $d/*.spl]}]} { - return - } - foreach fname $files { - if {[file isfile $fname] && [file readable $fname]} { - set jobfile [open $fname] - gets $jobfile portalroot - gets $jobfile portaltask - close $jobfile - - RobotStartJob $portalroot $portaltask - } - } - } -} - -proc RobotRR {task} { - global control robotsRunning tasks robotsMax status - - dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------" - incr robotsRunning -1 - - # only one task gets through... - if {[string compare [lindex $tasks 0] $task]} { - return - } - dbgmsg "RobotRR. task = $task" - while {$robotsRunning} { - vwait robotsRunning - } - dbgmsg "Scan" - if {[catch {RobotScanDir} msg]} { - logmsg "RobotScanDir failed" - logmsg $msg - } - foreach t $tasks { - set target $control($t,target) - set statusfile [open $target/status w] - puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" - close $statusfile - set control($t,seq) 0 - RobotStart $t - } -} - -proc RobotDaemonSig {} { - global daemon_cnt - - incr daemon_cnt -} - -proc RobotDaemonLoop {} { - global daemon_cnt tasks robotsRunning status - - set daemon_cnt 0 - while 1 { - logmsg $daemon_cnt - RobotScanDir - - if {[info exists tasks]} { - logmsg "daemon loop tasks $tasks" - foreach t $tasks { - set control($t,seq) 0 - RobotStart $t - } - while {$robotsRunning} { - vwait robotsRunning - } - } - after 30000 RobotDaemonSig - vwait daemon_cnt - } -} - -proc RobotRestart {task url sock} { - global URL robotsRunning - - close $sock - after cancel $URL($sock,cancel) - - foreach v [array names URL $task,$url,*] { - unset URL($v) - } - - incr robotsRunning -1 - RobotStart $task -} - -proc RobotStart {task} { - global URL - global robotsRunning robotsMax idletime status tasks - - # dbgmsg "RobotStart $task running=$robotsRunning" - while {1} { - set url [RobotFileNext $task unvisited] - if {[string compare $url done] == 0} { - dbgmsg "In RobotStart task $task done" - - catch {unset ntasks} - foreach t $tasks { - if {[string compare $t $task]} { - lappend ntasks $t - } else { - dbgmsg "task $t done" - } - } - if {![info exists ntasks]} { - unset tasks - dbgmsg "all done" - } else { - set tasks $ntasks - } - RobotDoneJob $task - return - } - if {![string length $url]} { - return - } - incr robotsRunning - if {[string compare $url wait] == 0} { - after $idletime [list RobotRR $task] - return - } - set r [RobotGetUrl $task $url {}] - if {!$r} { - if {$robotsRunning >= $robotsMax} return - } else { - incr robotsRunning -1 - if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { - set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] - RobotFileClose $outf - } - RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) - } - } -} - -proc headSave {task url out} { - global URL - - if {[info exists URL($task,$url,head,last-modified)]} { - puts $out "$URL($task,$url,head,last-modified)" - } - puts $out {} - if {[info exists URL($task,$url,head,date)]} { - puts $out " $URL($task,$url,head,date)" - } - if {[info exists URL($task,$url,head,content-length)]} { - puts $out " $URL($task,$url,head,content-length)" - } - if {[info exists URL($task,$url,head,server)]} { - puts $out " $URL($task,$url,head,server)" - } - puts $out {} - puts $out {} - puts $out " $url" - if {[info exists URL($task,$url,head,content-type)]} { - puts $out " $URL($task,$url,head,content-type)" - } - puts $out {} -} - -proc RobotHref {task url hrefx hostx pathx} { - global URL control debuglevel - upvar $hrefx href - upvar $hostx host - upvar $pathx path - - if {$debuglevel > 1} { - dbgmsg "Ref input url = $url href=$href" - } - - if {[string first { } $href] >= 0} { - return 0 - } - if {[string length $href] > 256} { - return 0 - } - -# Skip pages that have ? in them -# if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} { -# return 0 -# } - # get method (if any) - if {![regexp {^([^/:]+):(.*)} $href x method hpath]} { - set hpath $href - set method http - } else { - if {[string compare $method http]} { - return 0 - } - } - # get host (if any) - if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} { - if {![string length $surl]} { - set surl / - } - if {[info exist control($task,domains)]} { - set ok 0 - foreach domain $control($task,domains) { - if {[string match $domain $host]} { - set ok 1 - break - } - } - if {!$ok} { - return 0 - } - } - } else { - regexp {^([^\#]*)} $hpath x surl - set host $URL($task,$url,hostport) - } - if {![string length $surl]} { - return 0 - } - if {[string first / $surl]} { - # relative path - set curpath $URL($task,$url,path) - if {[info exists URL($task,$url,bpath)]} { - set curpath $URL($task,$url,bpath) - } - regexp {^([^\#?]*)} $curpath x dpart - set l [string last / $dpart] - if {[expr $l >= 0]} { - set surl [string range $dpart 0 $l]$surl - } else { - set surl $dpart/$surl - } - } - set surllist [split $surl /] - catch {unset path} - set pathl 0 - foreach c $surllist { - switch -- $c { - .. { - if {$pathl > 1} { - incr pathl -2 - set path [lrange $path 0 $pathl] - incr pathl - } - } - . { - - } - default { - incr pathl - lappend path $c - } - } - } - if {$debuglevel > 4} { - dbgmsg "pathl=$pathl output path=$path" - } - set path [join $path /] - if {![string length $path]} { - set path / - } - regsub -all {~} $path {%7E} path - set href "$method://$host$path" - - if {$debuglevel > 1} { - dbgmsg "Ref result = $href" - } - return [checkrule $task url $href] -} - -proc RobotError {task url code} { - global URL - - dbgmsg "Bad URL $url (code $code)" - set fromurl {} - set distance -1 - if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { - set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] - RobotReadRecord $inf fromurl distance - RobotFileClose $inf - } - RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) - if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { - set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] - RobotWriteRecord $outf $fromurl $distance - RobotFileClose $outf - } -} - -proc RobotRedirect {task url tourl code} { - global URL - - dbgmsg "Redirecting from $url to $tourl" - - set distance {} - set fromurl {} - if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { - set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] - RobotReadRecord $inf fromurl distance - RobotFileClose $inf - } - if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} { - set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)] - RobotWriteRecord $outf $fromurl $distance - RobotFileClose $outf - } - if {[RobotHref $task $url tourl host path]} { - if {![RobotFileExist $task visited $host $path]} { - if {![RobotFileExist $task unvisited $host $path]} { - set outf [RobotFileOpen $task unvisited $host $path] - RobotWriteRecord $outf $fromurl $distance - RobotFileClose $outf - } - } else { - set olddistance {} - set inf [RobotFileOpen $task visited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[string length $distance] == 0} { - set distance 1000 - } - dbgmsg "distance=$distance olddistance=$olddistance" - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen $task unvisited $host $path] - RobotWriteRecord $outf $tourl $distance - RobotFileClose $outf - } - } - } - if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} { - dbgmsg "unlink failed" - exit 1 - } -} - -proc wellform {body} { - regsub -all {} $body { } abody - regsub -all -nocase {} $abody {} body - regsub -all {<[^\>]+>} $body {} abody - regsub -all { } $abody { } body - regsub -all {&} $body {&} abody - return $abody -} - -proc link {task url out href body distance} { - global URL control - if {[expr $distance > $control($task,distance)]} return - - if {![RobotHref $task $url href host path]} return - - if ($control($task,cr)) { - puts $out "" - puts $out "$href" - set abody [wellform $body] - puts $out "$abody" - puts $out "" - } - - if {![RobotFileExist $task visited $host $path]} { - set olddistance 1000 - if {![RobotFileExist $task bad $host $path]} { - if {[RobotFileExist $task unvisited $host $path]} { - set inf [RobotFileOpen $task unvisited $host $path r] - RobotReadRecord $inf oldurl olddistance - RobotFileClose $inf - } - } else { - set olddistance 0 - } - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - set outf [RobotFileOpen $task unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } elseif {[string compare $href $url]} { - set inf [RobotFileOpen $task visited $host $path r] - RobotReadRecord $inf xurl olddistance - close $inf - if {[string length $olddistance] == 0} { - set olddistance 1000 - } - if {[expr $distance < $olddistance]} { - dbgmsg "OK remarking url=$url href=$href" - dbgmsg "olddistance = $olddistance" - dbgmsg "newdistance = $distance" - set outf [RobotFileOpen $task unvisited $host $path] - RobotWriteRecord $outf $url $distance - RobotFileClose $outf - } - } -} - -proc RobotTextTkl {task url out} { - global URL control - - # set title so we can emit it for the body - set title {} - # if true, nothing will be indexed - set noindex 0 - # if true, nothing will be followed - set nofollow 0 - - puts $control($task,output) - - set out stdout - set distance distance - - htmlSwitch $URL($task,$url,buf) \ - title { - # når title tag er hittet, er body set til indholdet af tagget - set title $body - } -nonest meta { - #puts -nonewline $out "" - } body { - # don't print title of document content if noindex is used - if {!$noindex} { - #puts $out "$title" - # xml compilancy added - set bbody [wellform $body] - #puts $out "" - #puts $out $bbody - #puts $out "" - } - } -nonest base { - # - if {![info exists parm(href)]} { - continue - } - set href [string trim $parm(href)] - } a { - # .. - # we're not using nonest - otherwise body isn't set - if {$nofollow} continue - if {![info exists parm(href)]} { - continue - } - #puts "link $task $url $out [string trim $parm(href)] $body $distance" - } -nonest area { - if {$nofollow} continue - if {![info exists parm(href)]} { - continue - } - #puts "link $task $url $out [string trim $parm(href)] $body $distance" - } -nonest frame { - if {![info exists parm(src)]} { - continue - } - #puts "link $task $url $out [string trim $parm(src)] $body $fdistance" - } -} - -proc RobotTextHtml {task url out} { - global URL control - - # set title so we can emit it for the body - set title {} - # if true, nothing will be indexed - set noindex 0 - # if true, nothing will be followed - set nofollow 0 - - set distance 0 - set fdistance 0 - if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} { - set fdistance $URL($task,$url,dist) - set distance [expr $fdistance + 1] - } - htmlSwitch $URL($task,$url,buf) \ - title { - set title $body - } -nonest meta { - # collect metadata and save NAME= CONTENT=.. - set metaname {} - set metacontent {} - puts -nonewline $out "" - # go through robots directives (af any) - if {![string compare $metaname robots]} { - set direcs [split [string tolower $metacontent] ,] - if {[lsearch $direcs noindex] >= 0} { - set noindex 1 - } - if {[lsearch $direcs nofollow] >= 0} { - set nofollow 1 - } - } - } body { - # don't print title of document content if noindex is used - if {!$noindex} { - puts $out "$title" - set bbody [wellform $body] - puts $out "" - puts $out $bbody - puts $out "" - } - } -nonest base { - # - if {![info exists parm(href)]} { - continue - } - set href [string trim $parm(href)] - if {![RobotHref $task $url href host path]} continue - set URL($task,$url,bpath) $path - } a { - # .. - # we're not using nonest - otherwise body isn't set - if {$nofollow} continue - if {![info exists parm(href)]} { - continue - } - link $task $url $out [string trim $parm(href)] $body $distance - } -nonest area { - if {$nofollow} continue - if {![info exists parm(href)]} { - continue - } - link $task $url $out [string trim $parm(href)] $body $distance - } -nonest frame { - if {![info exists parm(src)]} { - continue - } - link $task $url $out [string trim $parm(src)] $body $fdistance - } -} - -proc RobotsTxt {task url} { - global agent URL - - RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf) -} - -proc RobotsTxt0 {task v buf} { - global URL agent - set section 0 - foreach l [split $buf \n] { - if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} { - set arg [string trim $arg] - dbgmsg "cmd=$cmd arg=$arg" - switch -- [string tolower $cmd] { - user-agent { - if {$section} break - set pat [string tolower $arg]* - set section [string match $pat $agent] - } - disallow { - if {$section} { - dbgmsg "rule [list 0 $arg]" - lappend $v [list 0 $arg] - } - } - allow { - if {$section} { - dbgmsg "rule [list 1 $arg]" - lappend $v [list 1 $arg] - } - } - } - } - } -} - -proc RobotTextPlain {task url out} { - global URL - - puts $out "" - regsub -all {<} $URL($task,$url,buf) {\<} content - puts $out $content - puts $out "" - - if {![string compare $URL($task,$url,path) /robots.txt]} { - RobotsTxt $task $url - } -} - -proc RobotWriteMetadata {task url out} { - global URL - - set charset $URL($task,$url,charset) - puts $out "" - puts $out "" - - set distance 1000 - if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} { - set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r] - RobotReadRecord $inf fromurl distance - RobotFileClose $inf - } - set URL($task,$url,dist) $distance - puts $out "" - puts $out " $distance" - puts $out "" - headSave $task $url $out - logmsg "Parsing $url distance=$distance" - switch $URL($task,$url,head,content-type) { - text/html { - if {[string length $distance]} { - RobotTextHtml $task $url $out - RobotTextTkl $task $url $out - } - } - text/plain { - RobotTextPlain $task $url $out - } - } - puts $out "" -} - -proc Robot200 {task url} { - global URL - - set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)] - puts -nonewline $out $URL($task,$url,buf) - RobotFileClose $out - - set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)] - RobotWriteMetadata $task $url $out - RobotFileClose $out - - RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) -} - -proc RobotReadContent {task url sock binary} { - global URL - - set buffer [read $sock 16384] - set readCount [string length $buffer] - - if {$readCount <= 0} { - Robot200 $task $url - RobotRestart $task $url $sock - } elseif {!$binary && [string first \0 $buffer] >= 0} { - Robot200 $task $url - RobotRestart $task $url $sock - } else { - # dbgmsg "Got $readCount bytes" - set URL($task,$url,buf) $URL($task,$url,buf)$buffer - } -} - -proc RobotReadHeader {task url sock} { - global URL debuglevel - - if {$debuglevel > 1} { - dbgmsg "HTTP head $url" - } - if {[catch {set buffer [read $sock 2148]}]} { - RobotError $task $url 404 - RobotRestart $task $url $sock - return - } - set readCount [string length $buffer] - - if {$readCount <= 0} { - RobotError $task $url 404 - RobotRestart $task $url $sock - } else { - # dbgmsg "Got $readCount bytes" - set URL($task,$url,buf) $URL($task,$url,buf)$buffer - - set n [string first \r\n\r\n $URL($task,$url,buf)] - if {$n > 1} { - set code 0 - set version {} - set headbuf [string range $URL($task,$url,buf) 0 $n] - incr n 4 - set URL($task,$url,charset) ISO-8859-1 - set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end] - - regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code - set lines [split $headbuf \n] - foreach line $lines { - if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} { - set URL($task,$url,head,[string tolower $name]) [string trim $value] - } - regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset) - } - dbgmsg "HTTP CODE $code" - set URL($task,$url,state) skip - switch $code { - 301 { - RobotRedirect $task $url $URL($task,$url,head,location) 301 - RobotRestart $task $url $sock - } - 302 { - RobotRedirect $task $url $URL($task,$url,head,location) 302 - RobotRestart $task $url $sock - } - 200 { - if {![info exists URL($task,$url,head,content-type)]} { - set URL($task,$url,head,content-type) {} - } - set binary 1 - switch -glob -- $URL($task,$url,head,content-type) { - text/* { - set binary 0 - } - } - if {![regexp {/robots.txt$} $url]} { - if {![checkrule $task mime $URL($task,$url,head,content-type)]} { - RobotError $task $url mimedeny - RobotRestart $task $url $sock - return - } - } - fileevent $sock readable [list RobotReadContent $task $url $sock $binary] - } - default { - RobotError $task $url $code - RobotRestart $task $url $sock - } - } - } - } -} - -proc RobotSockCancel {task url sock} { - - logmsg "RobotSockCancel sock=$sock url=$url" - RobotError $task $url 401 - RobotRestart $task $url $sock -} - -proc RobotConnect {task url sock} { - global URL agent acceptLanguage - - fconfigure $sock -translation {lf crlf} -blocking 0 - fileevent $sock readable [list RobotReadHeader $task $url $sock] - puts $sock "GET $URL($task,$url,path) HTTP/1.0" - puts $sock "Host: $URL($task,$url,host)" - puts $sock "User-Agent: $agent" - if {[string length $acceptLanguage]} { - puts $sock "Accept-Language: $acceptLanguage" - } - puts $sock "" - set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]] - if {[catch {flush $sock}]} { - RobotError $task $url 404 - RobotRestart $task $url $sock - } -} - -proc RobotNop {} { - -} - -proc RobotGetUrl {task url phost} { - global URL robotsRunning - flush stdout - dbgmsg "Retrieve running=$robotsRunning url=$url task=$task" - if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} { - return -1 - } - if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} { - set port 80 - set host $hostport - } - set URL($task,$url,method) $method - set URL($task,$url,host) $host - set URL($task,$url,hostport) $hostport - set URL($task,$url,path) $path - set URL($task,$url,state) head - set URL($task,$url,buf) {} - - if {[string compare $path /robots.txt]} { - set ok 1 - if {![info exists URL($hostport,robots)]} { - dbgmsg "READING robots.txt for host $hostport" - if {[RobotFileExist $task visited $hostport /robots.txt]} { - set inf [RobotFileOpen $task visited $hostport /robots.txt r] - set buf [read $inf 32768] - close $inf - } else { - set buf "User-agent: *\nAllow: /\n" - } - RobotsTxt0 $task URL($hostport,robots) $buf - } - if {[info exists URL($hostport,robots)]} { - foreach l $URL($hostport,robots) { - if {[string first [lindex $l 1] $path] == 0} { - set ok [lindex $l 0] - break - } - } - } - if {!$ok} { - dbgmsg "skipped due to robots.txt" - return -1 - } - } - if [catch {set sock [socket -async $host $port]}] { - return -1 - } - RobotConnect $task $url $sock - - return 0 -} - -proc loadlib {} { - global libdir - - if {![llength [info commands htmlSwitch]]} { - if {[info exists env(tclrobot_lib)]} { - set d $env(tclrobot_lib) - } else { - if { $libdir > "" } { - set d $libdir - } else { - set d . - } - } - set e [info sharedlibextension] - dbgmsg "About to load $d/tclrobot$e" - if {[catch {load $d/tclrobot$e}]} { - dbgmsg "Didn't get at $d, trying directly" - load tclrobot$e - } - dbgmsg "Loaded tclrobot$e all right" - } -} - -set agent "zmbot/0.2" -if {![catch {set os [exec uname -s -r]}]} { - set agent "$agent ($os)" -} - -dbgmsg "agent: $agent" - -proc bgerror {m} { - global errorInfo - dbgmsg "BGERROR $m" - dbgmsg $errorInfo -} - -# Rules: allow, deny, url - -proc checkrule {task type this} { - global control - global debuglevel - - set default_ret 1 - - if {$debuglevel > 3} { - dbgmsg "CHECKRULE $type $this" - } - if {[info exist control($task,alrules)]} { - foreach l $control($task,alrules) { - if {$debuglevel > 3} { - dbgmsg "consider $l" - } - # consider type - if {[lindex $l 1] != $type} continue - # consider mask (! negates) - set masks [lindex $l 2] - set ok 0 - set default_ret 0 - foreach mask $masks { - if {$debuglevel > 4} { - dbgmsg "consider single mask $mask" - } - if {[string index $mask 0] == "!"} { - set mask [string range $mask 1 end] - if {[string match $mask $this]} continue - } else { - if {![string match $mask $this]} continue - } - set ok 1 - } - if {$debuglevel > 4} { - dbgmsg "ok = $ok" - } - if {!$ok} continue - # OK, we have a match - if {[lindex $l 0] == "allow"} { - if {$debuglevel > 3} { - dbgmsg "CHECKRULE MATCH OK" - } - return 1 - } else { - if {$debuglevel > 3} { - dbgmsg "CHECKFULE MATCH FAIL" - } - return 0 - } - } - } - if {$debuglevel > 3} { - dbgmsg "CHECKRULE MATCH DEFAULT $default_ret" - } - return $default_ret -} - - -proc url {href} { - global debuglevel task - - if {[RobotHref $task http://www.indexdata.dk/ href host path]} { - if {![RobotFileExist $task visited $host $path]} { - set outf [RobotFileOpen $task unvisited $host $path] - RobotWriteRecord $outf href 0 - RobotFileClose $outf - } - } -} - -proc deny {type stuff} { - global control task - - lappend control($task,alrules) [list deny $type $stuff] -} - -proc allow {type stuff} { - global control task - - lappend control($task,alrules) [list allow $type $stuff] -} - -proc debug {level} { - global debuglevel - - set debuglevel $level -} - -proc CreateTask {t} { - global tasks task status control - - set task $t - - if {[info exists tasks]} { - if {[lsearch -exact $tasks $t] >= 0} { - return 0 - } - } - - lappend tasks $t - set status($t,unvisited) 0 - set status($t,visited) 0 - set status($t,bad) 0 - set status($t,raw) 0 - set status($t,active) 1 - set control($t,seq) 0 - set control($t,distance) 10 - set control($t,target) tmp - set control($t,output) output - set control($t,cr) 0 - return 1 -} - -# Little utility that ensures that at least one task is present (main). -proc CreateMainTask {} { - global tasks - if {![info exist tasks]} { - CreateTask main - } -} - -# Parse options - -set i 0 -set l [llength $argv] - -if {$l < 1} { - puts {tclrobot: usage:} - puts {tclrobot [-j jobs] [-p pid] [-T tmpdir] [-o logfile] [-i idle] [-c - count] [-d domain] [-D spooldir] [-r rules] [-L libdir] [url ..]} - logmsg " Example: -c 3 -d '*.dk' http://www.indexdata.dk/" - - exit 1 -} - - -while {$i < $l} { - set arg [lindex $argv $i] - switch -glob -- $arg { - -o* { - set fname [string range $arg 2 end] - if {![string length $fname]} { - set fname [lindex $argv [incr i]] - } - set loghandle [open $fname a] - #dbgmsg "agent: $agent" - #dbgmsg "-o $fname" - } - -p* { - set pidfname [string range $arg 2 end] - if {![string length $pidfname]} { - set pidfname [lindex $argv [incr i]] - } - #dbgmsg "-p $pidfname" - if {[file exists $pidfname]} { - set pf [open $pidfname] - gets $pf oldpid - close $pf - logmsg "File $pidfname already exist. pid=$oldpid" - if {[file isdirectory /proc/$oldpid]} { - logmsg "And it's apparently running. Exiting." - exit 1 - } - } - set pf [open $pidfname w] - puts $pf [pid] - close $pf - } - -T* { - set tmpdir [string range $arg 2 end] - if {![string length $tmpdir]} { - set tmpdir [lindex $argv [incr i]] - } - set control(tmpdir) $tmpdir - } - -L* { - set libdir [string range $arg 2 end] - if {![string length $libdir]} { - set libdir [lindex $argv [incr i]] - } - } - -t* { - set t [string range $arg 2 end] - if {![string length $t]} { - set t [lindex $argv [incr i]] - } - CreateTask $t - } - -D* { - set dir [string range $arg 2 end] - if {![string length $dir]} { - set dir [lindex $argv [incr i]] - } - lappend daemon_dir $dir - } - -j* { - set robotsMax [string range $arg 2 end] - if {![string length $robotsMax]} { - set robotsMax [lindex $argv [incr i]] - } - } - -c* { - CreateMainTask - set control($task,distance) [string range $arg 2 end] - if {![string length $control($task,distance)]} { - set control($task,distance) [lindex $argv [incr i]] - } - } - -d* { - CreateMainTask - set dom [string range $arg 2 end] - if {![string length $dom]} { - set dom [lindex $argv [incr i]] - } - lappend control($task,domains) $dom - } - -i* { - set idletime [string range $arg 2 end] - if {![string length $idletime]} { - set idletime [lindex $argv [incr i]] - } - } - -l* { - CreateMainTask - set acceptLanguage [string range $arg 2 end] - if {![string length $acceptLanguage]} { - set acceptLanguage [lindex $argv [incr i]] - } - } - -r* { - CreateMainTask - set rfile [string range $arg 2 end] - if {![string length $rfile]} { - set rfile [lindex $argv [incr i]] - } - catch {unset maxdistance} - source $rfile - if {[info exists maxdistance]} { - set control($task,distance) $maxdistance - } - } - default { - CreateMainTask - set href $arg - #dbgmsg "in default: arg= $arg !!!" - loadlib - if {[RobotHref $task http://www.indexdata.dk/ href host path]} { - if {![RobotFileExist $task visited $host $path]} { - set outf [RobotFileOpen $task unvisited $host $path] - RobotWriteRecord $outf href 0 - RobotFileClose $outf - } - } - } - } - incr i -} - - -dbgmsg "Parsed args, now loading" -loadlib - -if {![info exist robotsMax]} { - set robotsMax 5 -} - -if {[info exist daemon_dir]} { - logmsg "Daemon mode" - RobotDaemonLoop -} else { - foreach t $tasks { - logmsg "task $t" - logmsg "max distance=$control($t,distance)" - if {[info exists control($t,domains)]} { - logmsg "domains=$control($t,domains)" - } - } - logmsg "max jobs=$robotsMax" - - foreach t $tasks { - RobotStart $t - } - - while {$robotsRunning} { - vwait robotsRunning - } - - if {[info exists tasks]} { - foreach t $tasks { - set statusfile [open $t/status w] - puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)" - close $statusfile - } - } -} -