tcl web harvesting script for tkl project added
authorMarc Cromme <marc@indexdata.dk>
Thu, 14 Aug 2003 08:02:10 +0000 (08:02 +0000)
committerMarc Cromme <marc@indexdata.dk>
Thu, 14 Aug 2003 08:02:10 +0000 (08:02 +0000)
tkl-web-harvester [new file with mode: 0755]

diff --git a/tkl-web-harvester b/tkl-web-harvester
new file mode 100755 (executable)
index 0000000..040da57
--- /dev/null
@@ -0,0 +1,1527 @@
+#!/usr/bin/tclsh 
+# $Id: tkl-web-harvester,v 1.1 2003/08/14 08:02:10 marc Exp $
+#
+set loghandle stdout
+set robotsRunning 0
+set workdir [pwd]
+set idletime 15000
+set acceptLanguage {}
+set debuglevel 1
+set libdir ""
+
+
+proc logmsg {msg} {
+    global loghandle
+
+    puts $loghandle $msg
+    flush $loghandle
+}
+
+#proc dbgmsg {level msg} {
+#    global debuglevel
+#    if {[expr $debuglevel >= $level]} {
+#        logmsg $msg
+#    }
+#}
+proc dbgmsg {msg} {
+    global debuglevel
+    if {[expr $debuglevel >= 0]} {
+        logmsg $msg
+    }
+}
+# dbgmsg is always called with just one string!
+
+
+proc fnameEncode {fname} {
+    regsub -all {&} $fname {%38} fname
+    regsub -all {<} $fname {%3C} fname
+    regsub -all {>} $fname {%3E} fname
+    regsub -all {\?} $fname {%3F} fname
+    regsub -all {\*} $fname {%2A} fname
+    return $fname
+}
+
+proc fnameDecode {fname} {
+    regsub -all {%38} $fname {&} fname
+    regsub -all {%3C} $fname {<} fname
+    regsub -all {%3E} $fname {>} fname
+    regsub -all {%3F} $fname {?} fname
+    regsub -all {%2A} $fname {*} fname
+    return $fname
+}
+
+proc RobotFileNext1 {area lead} {
+    # dbgmsg "RobotFileNext1 area=$area lead=$lead"
+    if {[catch {set ns [glob ${area}/*]}]} {
+        return {}
+    }
+    foreach n $ns {
+       if {[file isfile $n]} {
+            set off [string last / $n]
+           # skip /
+           incr off
+           set end [string length $n]
+           # skip _.tkl
+           incr end -6
+            return $lead/[string range $n $off $end]
+        }
+    }
+    foreach n $ns {
+       if {[file isdirectory $n]} {
+            set off [string last / $n]
+           # skip /
+           incr off
+            set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
+            if {[string length $sb]} {
+                return $sb
+            }
+        }
+    }
+    return {}
+}
+
+proc RobotWriteRecord {outf fromurl distance} {
+    puts $outf {<?xml version="1.0" encoding="ISO-8859-1" standalone="yes"?>}
+    puts $outf "<zmbot>"
+    puts $outf "<distance>"
+    puts $outf $distance
+    puts $outf "</distance>"
+    puts $outf "<fromurl>"
+    puts $outf $fromurl
+    puts $outf "</fromurl>"
+    puts $outf "</zmbot>"
+}
+
+proc RobotReadRecord {inf fromurlx distancex} {
+    upvar $fromurlx fromurl
+    upvar $distancex distance
+    gets $inf
+    gets $inf
+    gets $inf
+    set distance [string trim [gets $inf]]
+    # dbgmsg "got distance = $distance"
+    gets $inf
+    gets $inf
+    set fromurl [string trim [gets $inf]]
+}
+
+proc RobotFileNext {task area} {
+    global control
+    global idletime ns
+    global status
+
+    # dbgmsg "RobotFileNext seq=$control($task,seq)"
+    if {$control($task,seq) < 0} {
+       return {}
+    }
+    set target $control($task,target)
+    if {$control($task,seq) == 0} {
+       if {[catch {set ns($task) [glob $target/$area/*]}]} {
+           puts "----------- DONE-------- target=$target"
+           return done
+       }
+    }
+    # dbgmsg "ns=$ns($task)"
+    set off [string length $target/$area]
+    incr off
+    set n [lindex $ns($task) $control($task,seq)]
+    # dbgmsg "n=$n"
+    if {![string length $n]} {
+       set control($task,seq) -1
+        set statusfile [open $target/status w]
+        puts $statusfile "$status($task,unvisited) $status($task,bad) $status($task,visited)"
+        close $statusfile
+       return wait
+    }
+    incr control($task,seq)
+    if {[file isfile $n/robots.txt_.tkl]} {
+       # dbgmsg "ok returning http://[string range $n $off end]/robots.txt"
+       return [fnameDecode http://[string range $n $off end]/robots.txt]
+    } elseif {[file isdirectory $n]} {
+       set sb [RobotFileNext1 $n http://[string range $n $off end]]
+       if {[string length $sb]} {
+           return [fnameDecode $sb]
+       }
+    }
+    dbgmsg "no more work at end of RobotFileNext n=$n"
+    dbgmsg "ns=$ns($task)"
+    return {}
+}
+
+
+proc RobotFileExist {task area host path} {
+    global debuglevel control
+
+    if {$debuglevel > 3} {
+        dbgmsg "RobotFileExist begin area=$area host=$host path=$path"
+    }
+    set target $control($task,target)
+    return [file exists [fnameEncode $target/$area/$host${path}_.tkl]]
+}
+
+proc RobotFileUnlink {task area host path} {
+    global status control
+
+    set target $control($task,target)
+    # dbgmsg "RobotFileUnlink begin"
+    # dbgmsg "area=$area host=$host path=$path"
+    set npath [fnameEncode $target/$area/$host${path}_.tkl]
+    # dbgmsg "npath=$npath"
+    set comp [split $npath /]
+    if {[catch {exec rm $npath}]} return
+
+    set l [llength $comp]
+    incr l -2
+    incr status($task,$area) -1
+    for {set i $l} {$i > 0} {incr i -1} {
+        set path [join [lrange $comp 0 $i] /]
+       if {![catch {glob $path/*}]} return
+        exec rmdir $path
+    }
+    # dbgmsg "RobotFileUnlink end"
+}
+
+proc RobotFileClose {out} {
+    if [string compare $out stdout] {
+       close $out
+    }
+}
+
+proc RobotFileOpen {task area host path {mode w}} {
+    set orgPwd [pwd]
+    global workdir status debuglevel control
+
+    # dbgmsg "RobotFileOpen task=$task path=$path"
+
+    set target $control($task,target)
+    set path [fnameEncode $path]
+
+    if {![info exists workdir]} {
+       return stdout
+    }
+    if {$debuglevel > 3} {
+        dbgmsg "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path mode=$mode"
+    }
+    if {[string compare $orgPwd $workdir]} {
+        dbgmsg "ooops. RobotFileOpen failed"
+       dbgmsg "workdir = $workdir"
+       dbgmsg "pwd = $orgPwd"
+       exit 1
+    }
+
+    set comp [split $target/$area/$host /]
+    set len [llength $comp]
+    incr len -1
+
+    # dbgmsg "1 comp=$comp"
+
+    for {set i 0} {$i <= $len} {incr i} {
+        set d [lindex $comp $i]
+       if {[string length $d] == 0} {
+           cd /
+       } elseif {[catch {cd $d}]} {
+            exec mkdir $d
+            cd ./$d
+           if {![string compare $area unvisited] && $i == $len && $mode == "w"} {
+               if {[string compare $path /robots.txt]} {
+                   set out [open robots.txt_.tkl w]
+                   dbgmsg "creating robots.txt in $d"
+                   close $out
+                    incr status($task,unvisited)
+               }
+           }
+        }
+    }
+
+    set comp [split $path /]
+    set len [llength $comp]
+    incr len -1
+
+    # dbgmsg "2 path=$path comp=$comp"
+
+    for {set i 0} {$i < $len} {incr i} {
+        set d [lindex $comp $i]
+        if {[string length $d] > 0} {
+            if {[catch {cd $d}]} {
+                exec mkdir $d
+                cd ./$d
+            }
+        }
+    }
+    set d [lindex $comp $len]
+    set out [open ${d}_.tkl $mode]
+    if {$mode == "w"} {
+        incr status($task,$area)
+    }
+    cd $orgPwd
+    return $out
+}
+
+proc RobotStartJob {root task} {
+    global control
+
+    set fname "$root$task"
+    set f [open $fname r]
+    set xml [read $f]
+    dbgmsg "Reading $fname"
+    close $f
+    # task type must be 2
+    if {![regexp {<tasktype>([^<]*)</tasktype>} $xml x tasktype]} {
+       return
+    }
+    set tasktype [string trim $tasktype]
+    if {![string match 2 $tasktype]} {
+       return
+    }
+    # status must not be finished or error 
+    if {![regexp {<status>([^<]*)</status>} $xml x status]} {
+       return
+    }
+    if {$status == "finished"} {
+        dbgmsg "already finished"
+        return
+    }
+    if {$status == "error"} {
+        dbgmsg "already finished due to error"
+        return
+    }
+    # ignore if task has already been processed
+    dbgmsg "status = $status"
+    if {![CreateTask $task]} {
+        return
+    }
+    set control($task,taskfname) $fname
+    dbgmsg "Reading $fname stage 2"
+    htmlSwitch $xml \
+        url {
+           lappend starturls $body
+        } filter {
+            set type $parm(type)
+            set action $parm(action)
+            if {$type == "domain"} {
+                $action url http://$body/*
+            }
+            if {$type == "url"} {
+                $action url $body
+            }
+            if {$type == "mime"} {
+                $action mime $body
+            }
+       } target {
+           set ex [file rootname [file tail $task]]
+           #set control($task,target) "$root$body/$ex"
+           set control($task,target) "$control(tmpdir)/$ex"
+           set control($task,output) "$root$body"
+        } distance {
+            set control($task,distance) $body
+        } status {
+            set control($task,filestatus) $body
+        } tasktype {
+           set control($task,tasktype) $body
+       }
+    
+    if {[info exists starturls]} {
+       foreach url $starturls {
+           puts "marking start urls $url"
+           url $url
+       }
+    }
+
+    if {$status == "pending"} {
+        regsub {<status>[^<]*</status>} $xml {<status>running</status>} xml2
+        set f [open $fname w]
+        puts -nonewline $f $xml2 
+        close $f
+    }
+}
+
+proc RobotDoneJob {task} {
+    global daemon_dir control
+
+    if {![info exists daemon_dir]} {
+        return
+    }
+    set fname $control($task,taskfname)
+    set f [open $fname r]
+    set xml [read $f]
+    dbgmsg "Reading $fname"
+    regexp {<status>([^<]*)</status>} $xml x status
+    dbgmsg "------"
+    dbgmsg "status = $status"
+    close $f
+
+    regsub {<status>[^<]*</status>} $xml {<status>finished</status>} xml2
+    set f [open $fname w]
+    puts -nonewline $f $xml2 
+    close $f
+}
+
+proc RobotScanDir {} {
+    global daemon_dir
+
+    if {![info exists daemon_dir]} {
+        return
+    }
+    foreach d $daemon_dir {
+        if {[catch {set files [glob $d/*.spl]}]} {
+            return
+        }
+        foreach fname $files {
+            if {[file isfile $fname] && [file readable $fname]} {
+               set jobfile [open $fname]
+               gets $jobfile portalroot
+               gets $jobfile portaltask
+               close $jobfile
+               
+                RobotStartJob $portalroot $portaltask
+            }
+        }
+    }
+}
+
+proc RobotRR {task} {
+    global control robotsRunning tasks robotsMax status
+
+    dbgmsg "RobotRR -- running=$robotsRunning max=$robotsMax---------------"
+    incr robotsRunning -1
+
+    # only one task gets through...
+    if {[string compare [lindex $tasks 0] $task]} {
+        return
+    }
+    dbgmsg "RobotRR. task = $task"
+    while {$robotsRunning} {
+       vwait robotsRunning
+    }
+    dbgmsg "Scan"
+    if {[catch {RobotScanDir} msg]} {
+        logmsg "RobotScanDir failed"
+        logmsg $msg
+    }
+    foreach t $tasks {
+       set target $control($t,target)
+        set statusfile [open $target/status w]
+        puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+        close $statusfile
+        set control($t,seq) 0
+        RobotStart $t
+    }
+}
+
+proc RobotDaemonSig {} {
+    global daemon_cnt
+
+    incr daemon_cnt
+}
+
+proc RobotDaemonLoop {} {
+    global daemon_cnt tasks robotsRunning status
+
+    set daemon_cnt 0
+    while 1 {
+        logmsg $daemon_cnt
+        RobotScanDir
+        
+        if {[info exists tasks]} {
+            logmsg "daemon loop tasks $tasks"
+            foreach t $tasks {
+                set control($t,seq) 0
+                RobotStart $t
+            }
+            while {$robotsRunning} {
+                vwait robotsRunning
+            }
+        }
+        after 30000 RobotDaemonSig
+        vwait daemon_cnt
+    }
+}
+
+proc RobotRestart {task url sock} {
+    global URL robotsRunning
+
+    close $sock
+    after cancel $URL($sock,cancel) 
+
+    foreach v [array names URL $task,$url,*] {
+       unset URL($v)
+    }
+
+    incr robotsRunning -1
+    RobotStart $task
+}
+
+proc RobotStart {task} {
+    global URL
+    global robotsRunning robotsMax idletime status tasks
+  
+    # dbgmsg "RobotStart $task running=$robotsRunning"
+    while {1} {
+        set url [RobotFileNext $task unvisited]
+       if {[string compare $url done] == 0} {
+            dbgmsg "In RobotStart task $task done"
+
+            catch {unset ntasks}
+            foreach t $tasks {
+                if {[string compare $t $task]} {
+                    lappend ntasks $t
+                } else {
+                    dbgmsg "task $t done"
+                }
+            }
+            if {![info exists ntasks]} {
+                unset tasks
+                dbgmsg "all done"
+            } else {
+                set tasks $ntasks
+            }
+            RobotDoneJob $task
+           return
+       }
+        if {![string length $url]} {
+           return
+       }
+        incr robotsRunning
+       if {[string compare $url wait] == 0} {
+            after $idletime [list RobotRR $task]
+            return
+       }
+        set r [RobotGetUrl $task $url {}]
+        if {!$r} {
+           if {$robotsRunning >= $robotsMax} return
+        } else {
+           incr robotsRunning -1
+           if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+               set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+               RobotFileClose $outf
+           }
+            RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+       }
+    }
+}
+
+proc headSave {task url out} {
+    global URL
+    
+    if {[info exists URL($task,$url,head,last-modified)]} {
+        puts $out "<lastmodified>$URL($task,$url,head,last-modified)</lastmodified>"
+    }
+    puts $out {<si>}
+    if {[info exists URL($task,$url,head,date)]} {
+        puts $out " <date>$URL($task,$url,head,date)</date>"
+    }
+    if {[info exists URL($task,$url,head,content-length)]} {
+        puts $out " <by>$URL($task,$url,head,content-length)</by>"
+    }
+    if {[info exists URL($task,$url,head,server)]} {
+        puts $out " <format>$URL($task,$url,head,server)</format>"
+    }
+    puts $out {</si>}
+    puts $out {<publisher>}
+    puts $out " <identifier>$url</identifier>"
+    if {[info exists URL($task,$url,head,content-type)]} {
+        puts $out " <type>$URL($task,$url,head,content-type)</type>"
+    }
+    puts $out {</publisher>}
+}
+
+proc RobotHref {task url hrefx hostx pathx} {
+    global URL control debuglevel
+    upvar $hrefx href
+    upvar $hostx host
+    upvar $pathx path
+
+    if {$debuglevel > 1} {
+        dbgmsg "Ref input url = $url href=$href"
+    }
+
+    if {[string first { } $href] >= 0} {
+       return 0
+    }
+    if {[string length $href] > 256} {
+       return 0
+    }
+
+#   Skip pages that have ? in them
+#    if {[string first {?} $url] >= 0 && [string first {?} $href] >= 0} {
+#      return 0
+#    }
+    # get method (if any)
+    if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
+       set hpath $href
+       set method http
+    } else {
+       if {[string compare $method http]} {
+           return 0
+       }
+    }
+    # get host (if any)
+    if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
+       if {![string length $surl]} {
+           set surl /
+       }
+        if {[info exist control($task,domains)]} {
+           set ok 0
+           foreach domain $control($task,domains) {
+               if {[string match $domain $host]} {
+                   set ok 1
+                   break
+                }
+           }
+           if {!$ok} {
+               return 0
+           }
+        }
+    } else {
+       regexp {^([^\#]*)} $hpath x surl
+       set host $URL($task,$url,hostport)
+    }
+    if {![string length $surl]} {
+       return 0
+    }
+    if {[string first / $surl]} {
+       # relative path
+        set curpath $URL($task,$url,path)
+        if {[info exists URL($task,$url,bpath)]} {
+            set curpath $URL($task,$url,bpath)
+        }
+       regexp {^([^\#?]*)} $curpath x dpart
+       set l [string last / $dpart]
+       if {[expr $l >= 0]} {
+           set surl [string range $dpart 0 $l]$surl
+       } else {
+           set surl $dpart/$surl
+       }
+    }
+    set surllist [split $surl /]
+    catch {unset path}
+    set pathl 0
+    foreach c $surllist {
+        switch -- $c {
+           .. {
+               if {$pathl > 1} {
+                   incr pathl -2
+                   set path [lrange $path 0 $pathl]
+                   incr pathl
+               }
+           }
+            . {
+
+            }
+            default {
+               incr pathl
+                lappend path $c
+           }
+       }
+    }
+    if {$debuglevel > 4} {
+        dbgmsg "pathl=$pathl output path=$path"
+    }
+    set path [join $path /]
+    if {![string length $path]} {
+       set path /
+    }
+    regsub -all {~} $path {%7E} path
+    set href "$method://$host$path"
+
+    if {$debuglevel > 1} {
+        dbgmsg "Ref result = $href"
+    }
+    return [checkrule $task url $href]
+}
+
+proc RobotError {task url code} {
+    global URL
+
+    dbgmsg "Bad URL $url (code $code)"
+    set fromurl {}
+    set distance -1
+    if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+       set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+       RobotReadRecord $inf fromurl distance
+       RobotFileClose $inf
+    }
+    RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+    if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+       set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+       RobotWriteRecord $outf $fromurl $distance
+       RobotFileClose $outf
+    }
+}
+
+proc RobotRedirect {task url tourl code} {
+    global URL
+
+    dbgmsg "Redirecting from $url to $tourl"
+
+    set distance {}
+    set fromurl {}
+    if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+       set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+       RobotReadRecord $inf fromurl distance
+       RobotFileClose $inf
+    }
+    if {![RobotFileExist $task bad $URL($task,$url,hostport) $URL($task,$url,path)]} {
+       set outf [RobotFileOpen $task bad $URL($task,$url,hostport) $URL($task,$url,path)]
+       RobotWriteRecord $outf $fromurl $distance
+       RobotFileClose $outf
+    }
+    if {[RobotHref $task $url tourl host path]} {
+       if {![RobotFileExist $task visited $host $path]} {
+           if {![RobotFileExist $task unvisited $host $path]} {
+               set outf [RobotFileOpen $task unvisited $host $path]
+               RobotWriteRecord $outf $fromurl $distance
+               RobotFileClose $outf
+           }
+       } else {
+           set olddistance {}
+           set inf [RobotFileOpen $task visited $host $path r]
+           RobotReadRecord $inf oldurl olddistance
+           RobotFileClose $inf
+           if {[string length $olddistance] == 0} {
+               set olddistance 1000
+           }
+           if {[string length $distance] == 0} {
+               set distance 1000
+           }
+           dbgmsg "distance=$distance olddistance=$olddistance"
+           if {[expr $distance < $olddistance]} {
+               set outf [RobotFileOpen $task unvisited $host $path]
+               RobotWriteRecord $outf $tourl $distance
+               RobotFileClose $outf
+           }
+       }
+    }
+    if {[catch {RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)}]} {
+        dbgmsg "unlink failed"
+        exit 1
+    }
+}
+
+proc wellform {body} {
+    regsub -all {<!--[^-]*-->} $body { } abody
+    regsub -all -nocase {<script[^<]*</script>} $abody {} body
+    regsub -all {<[^\>]+>} $body {} abody
+    regsub -all {&nbsp;} $abody { } body
+    regsub -all {&} $body {&amp;} abody
+    return $abody
+}
+
+proc link {task url out href body distance} {
+    global URL control
+    if {[expr $distance > $control($task,distance)]} return
+    
+    if {![RobotHref $task $url href host path]} return
+    
+    if ($control($task,cr)) {
+       puts $out "<cr>"
+       puts $out "<identifier>$href</identifier>"
+       set abody [wellform $body]
+       puts $out "<description>$abody</description>"
+       puts $out "</cr>"
+    }
+    
+    if {![RobotFileExist $task visited $host $path]} {
+        set olddistance 1000
+        if {![RobotFileExist $task bad $host $path]} {
+            if {[RobotFileExist $task unvisited $host $path]} {
+                set inf [RobotFileOpen $task unvisited $host $path r]
+                RobotReadRecord $inf oldurl olddistance
+                RobotFileClose $inf
+            }
+        } else {
+            set olddistance 0
+        }
+        if {[string length $olddistance] == 0} {
+            set olddistance 1000
+        }
+        if {[expr $distance < $olddistance]} {
+            set outf [RobotFileOpen $task unvisited $host $path]
+            RobotWriteRecord $outf $url $distance
+            RobotFileClose $outf
+        }
+    } elseif {[string compare $href $url]} {
+        set inf [RobotFileOpen $task visited $host $path r]
+        RobotReadRecord $inf xurl olddistance
+        close $inf
+        if {[string length $olddistance] == 0} {
+            set olddistance 1000
+        }
+        if {[expr $distance < $olddistance]} {
+            dbgmsg "OK remarking url=$url href=$href"
+            dbgmsg "olddistance = $olddistance"
+            dbgmsg "newdistance = $distance"
+            set outf [RobotFileOpen $task unvisited $host $path]
+            RobotWriteRecord $outf $url $distance
+            RobotFileClose $outf
+        }
+    }
+}
+
+proc RobotTextTkl {task url out} {
+    global URL control
+
+    # set title so we can emit it for the body
+    set title {}
+    # if true, nothing will be indexed
+    set noindex 0
+    # if true, nothing will be followed
+    set nofollow 0
+
+    puts $control($task,output)
+
+    set out stdout
+    set distance distance
+
+    htmlSwitch $URL($task,$url,buf) \
+        title {
+            # nÃ¥r title tag er hittet, er body set til indholdet af tagget
+            set title $body
+        } -nonest meta {
+            #puts -nonewline $out "<meta"
+            # al er list med attribut navne som fandtes ind i parm hash
+           set al [array names parm]  
+            # løkke igennem attributer
+            foreach a $al {
+                set al [string tolower $a]
+               #puts -nonewline $out " $al"
+                #puts -nonewline $out {="}
+                #puts -nonewline $out $parm($a)
+                #puts -nonewline $out {"}
+               unset parm($al)
+            }
+           #puts $out "></meta>"
+       } body {
+            # don't print title of document content if noindex is used
+            if {!$noindex} {
+                #puts $out "<title>$title</title>"
+                # xml compilancy added
+               set bbody [wellform $body]
+                #puts $out "<documentcontent>"
+                #puts $out $bbody
+                #puts $out "</documentcontent>"
+            }
+        } -nonest base {
+            # <base href=.. >
+            if {![info exists parm(href)]} {
+               continue
+            }
+            set href [string trim $parm(href)]
+        } a {
+            # <a href="...."> .. </a> 
+            # we're not using nonest - otherwise body isn't set
+            if {$nofollow} continue
+            if {![info exists parm(href)]} {
+               continue
+            }
+            #puts "link $task $url $out [string trim $parm(href)] $body $distance"
+        } -nonest area {
+            if {$nofollow} continue
+            if {![info exists parm(href)]} {
+               continue
+            }
+            #puts "link $task $url $out [string trim $parm(href)] $body $distance"
+        } -nonest frame {
+            if {![info exists parm(src)]} {
+               continue
+            }
+            #puts "link $task $url $out [string trim $parm(src)] $body $fdistance"
+       }
+}
+
+proc RobotTextHtml {task url out} {
+    global URL control
+
+    # set title so we can emit it for the body
+    set title {}
+    # if true, nothing will be indexed
+    set noindex 0
+    # if true, nothing will be followed
+    set nofollow 0
+
+    set distance 0
+    set fdistance 0
+    if {$control($task,distance) < 1000 && [info exists URL($task,$url,dist)]} {
+        set fdistance $URL($task,$url,dist)
+       set distance [expr $fdistance + 1]
+    }
+    htmlSwitch $URL($task,$url,buf) \
+        title {
+            set title $body
+        } -nonest meta {
+            # collect metadata and save NAME= CONTENT=..
+            set metaname {}
+            set metacontent {}
+            puts -nonewline $out "<meta"
+           set al [array names parm]
+            foreach a $al {
+                set al [string tolower $a]
+               puts -nonewline $out " $al"
+                puts -nonewline $out {="}
+                puts -nonewline $out $parm($a)
+                puts -nonewline $out {"}
+                switch -- $al {
+                    "name" {
+                        set metaname [string tolower $parm($a)]
+                    }
+                    "content" {
+                        set metacontent $parm($a)
+                    }
+                }
+               unset parm($al)
+            }
+           puts $out "></meta>"
+            # go through robots directives (af any)
+            if {![string compare $metaname robots]} {
+                set direcs [split [string tolower $metacontent] ,]
+                if {[lsearch $direcs noindex] >= 0} {
+                    set noindex 1
+                }
+                if {[lsearch $direcs nofollow] >= 0} {
+                    set nofollow 1
+                }
+            }
+       } body {
+            # don't print title of document content if noindex is used
+            if {!$noindex} {
+                puts $out "<title>$title</title>"
+               set bbody [wellform $body]
+                puts $out "<documentcontent>"
+                puts $out $bbody
+                puts $out "</documentcontent>"
+            }
+        } -nonest base {
+            # <base href=.. >
+            if {![info exists parm(href)]} {
+               continue
+            }
+            set href [string trim $parm(href)]
+            if {![RobotHref $task $url href host path]} continue
+            set URL($task,$url,bpath) $path
+        } a {
+            # <a href="...."> .. </a> 
+            # we're not using nonest - otherwise body isn't set
+            if {$nofollow} continue
+            if {![info exists parm(href)]} {
+               continue
+            }
+            link $task $url $out [string trim $parm(href)] $body $distance
+        } -nonest area {
+            if {$nofollow} continue
+            if {![info exists parm(href)]} {
+               continue
+            }
+            link $task $url $out [string trim $parm(href)] $body $distance
+        } -nonest frame {
+            if {![info exists parm(src)]} {
+               continue
+            }
+            link $task $url $out [string trim $parm(src)] $body $fdistance
+       }
+}
+
+proc RobotsTxt {task url} {
+    global agent URL
+
+    RobotsTxt0 $task URL(URL($task,$url,hostport),robots) $URL($task,$url,buf)
+}
+
+proc RobotsTxt0 {task v buf} {
+    global URL agent
+    set section 0
+    foreach l [split $buf \n] {
+       if {[regexp {([-A-Za-z]+):[ ]*([^\# ]+)} $l match cmd arg]} {
+            set arg [string trim $arg]
+           dbgmsg "cmd=$cmd arg=$arg"
+           switch -- [string tolower $cmd] {
+               user-agent {
+                   if {$section} break
+                   set pat [string tolower $arg]*
+                   set section [string match $pat $agent]
+               }
+               disallow {
+                   if {$section} {
+                       dbgmsg "rule [list 0 $arg]"
+                       lappend $v [list 0 $arg]
+                   }
+               }
+               allow {
+                   if {$section} {
+                       dbgmsg "rule [list 1 $arg]"
+                       lappend $v [list 1 $arg]
+                   }
+               }
+           }
+       }
+    }
+}
+
+proc RobotTextPlain {task url out} {
+    global URL
+
+    puts $out "<documentcontent>"
+    regsub -all {<} $URL($task,$url,buf) {\&lt;} content
+    puts $out $content
+    puts $out "</documentcontent>"
+
+    if {![string compare $URL($task,$url,path) /robots.txt]} {
+       RobotsTxt $task $url
+    }
+}
+
+proc RobotWriteMetadata {task url out} {
+    global URL
+
+    set charset $URL($task,$url,charset)
+    puts $out "<?xml version=\"1.0\" encoding=\"$charset\" standalone=\"yes\"?>"
+    puts $out "<zmbot>"
+
+    set distance 1000
+    if {[RobotFileExist $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)]} {
+       set inf [RobotFileOpen $task unvisited $URL($task,$url,hostport) $URL($task,$url,path) r]
+       RobotReadRecord $inf fromurl distance
+       RobotFileClose $inf
+    }
+    set URL($task,$url,dist) $distance
+    puts $out "<distance>"
+    puts $out "  $distance"
+    puts $out "</distance>"
+    headSave $task $url $out
+    logmsg "Parsing $url distance=$distance"
+    switch $URL($task,$url,head,content-type) {
+        text/html {
+            if {[string length $distance]} {
+                RobotTextHtml $task $url $out
+                RobotTextTkl $task $url $out
+            }
+        }
+        text/plain {
+            RobotTextPlain $task $url $out
+        }
+    }
+    puts $out "</zmbot>"
+}
+
+proc Robot200 {task url} {
+    global URL
+    
+    set out [RobotFileOpen $task raw $URL($task,$url,hostport) $URL($task,$url,path)]
+    puts -nonewline $out $URL($task,$url,buf)
+    RobotFileClose $out
+
+    set out [RobotFileOpen $task visited $URL($task,$url,hostport) $URL($task,$url,path)]
+    RobotWriteMetadata $task $url $out
+    RobotFileClose $out
+
+    RobotFileUnlink $task unvisited $URL($task,$url,hostport) $URL($task,$url,path)
+}
+
+proc RobotReadContent {task url sock binary} {
+    global URL
+
+    set buffer [read $sock 16384]
+    set readCount [string length $buffer]
+
+    if {$readCount <= 0} {
+       Robot200 $task $url
+       RobotRestart $task $url $sock
+    } elseif {!$binary && [string first \0 $buffer] >= 0} {
+       Robot200 $task $url
+       RobotRestart $task $url $sock
+    } else {
+       # dbgmsg "Got $readCount bytes"
+       set URL($task,$url,buf) $URL($task,$url,buf)$buffer
+    }
+}
+
+proc RobotReadHeader {task url sock} {
+    global URL debuglevel
+
+    if {$debuglevel > 1} {
+        dbgmsg "HTTP head $url"
+    }
+    if {[catch {set buffer [read $sock 2148]}]} {
+       RobotError $task $url 404
+       RobotRestart $task $url $sock
+        return
+    }
+    set readCount [string length $buffer]
+    
+    if {$readCount <= 0} {
+       RobotError $task $url 404
+       RobotRestart $task $url $sock
+    } else {
+       # dbgmsg "Got $readCount bytes"
+       set URL($task,$url,buf) $URL($task,$url,buf)$buffer
+       
+       set n [string first \r\n\r\n $URL($task,$url,buf)]
+       if {$n > 1} {
+           set code 0
+           set version {}
+           set headbuf [string range $URL($task,$url,buf) 0 $n]
+           incr n 4
+           set URL($task,$url,charset) ISO-8859-1
+           set URL($task,$url,buf) [string range $URL($task,$url,buf) $n end]
+           
+           regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
+           set lines [split $headbuf \n]
+           foreach line $lines {
+               if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
+                   set URL($task,$url,head,[string tolower $name]) [string trim $value]
+               }
+               regexp {^Content-Type:.*charset=([A-Za-z0-9_-]*)} $line x URL($task,$url,charset)
+           }
+           dbgmsg "HTTP CODE $code"
+           set URL($task,$url,state) skip
+           switch $code {
+               301 {
+                   RobotRedirect $task $url $URL($task,$url,head,location) 301
+                   RobotRestart $task $url $sock
+               }
+               302 {
+                   RobotRedirect $task $url $URL($task,$url,head,location) 302
+                   RobotRestart $task $url $sock
+               }
+               200 {
+                   if {![info exists URL($task,$url,head,content-type)]} {
+                       set URL($task,$url,head,content-type) {}
+                   }
+                   set binary 1
+                   switch -glob -- $URL($task,$url,head,content-type) {
+                       text/* {
+                           set binary 0
+                       }
+                   }
+                    if {![regexp {/robots.txt$} $url]} {
+                        if {![checkrule $task mime $URL($task,$url,head,content-type)]} {
+                            RobotError $task $url mimedeny
+                            RobotRestart $task $url $sock
+                            return
+                        }
+                    }
+                   fileevent $sock readable [list RobotReadContent $task $url $sock $binary]
+               }
+               default {
+                   RobotError $task $url $code
+                   RobotRestart $task $url $sock
+               }
+           }
+       }
+    }
+}
+
+proc RobotSockCancel {task url sock} {
+
+    logmsg "RobotSockCancel sock=$sock url=$url"
+    RobotError $task $url 401
+    RobotRestart $task $url $sock
+}
+
+proc RobotConnect {task url sock} {
+    global URL agent acceptLanguage
+
+    fconfigure $sock -translation {lf crlf} -blocking 0
+    fileevent $sock readable [list RobotReadHeader $task $url $sock]
+    puts $sock "GET $URL($task,$url,path) HTTP/1.0"
+    puts $sock "Host: $URL($task,$url,host)"
+    puts $sock "User-Agent: $agent"
+    if {[string length $acceptLanguage]} {
+        puts $sock "Accept-Language: $acceptLanguage"
+    }
+    puts $sock ""
+    set URL($sock,cancel) [after 30000 [list RobotSockCancel $task $url $sock]]
+    if {[catch {flush $sock}]} {
+        RobotError $task $url 404
+       RobotRestart $task $url $sock
+    }
+}
+
+proc RobotNop {} {
+
+}
+
+proc RobotGetUrl {task url phost} {
+    global URL robotsRunning
+    flush stdout
+    dbgmsg "Retrieve running=$robotsRunning url=$url task=$task"
+    if {![regexp {([^:]+)://([^/]+)(.*)} $url x method hostport path]} {
+        return -1
+    }
+    if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
+       set port 80
+       set host $hostport
+    }
+    set URL($task,$url,method) $method
+    set URL($task,$url,host) $host
+    set URL($task,$url,hostport) $hostport
+    set URL($task,$url,path) $path
+    set URL($task,$url,state) head
+    set URL($task,$url,buf) {}
+
+    if {[string compare $path /robots.txt]} {
+       set ok 1
+       if {![info exists URL($hostport,robots)]} {
+           dbgmsg "READING robots.txt for host $hostport"
+           if {[RobotFileExist $task visited $hostport /robots.txt]} {
+               set inf [RobotFileOpen $task visited $hostport /robots.txt r]
+               set buf [read $inf 32768]
+               close $inf
+           } else {
+               set buf "User-agent: *\nAllow: /\n"
+           }
+           RobotsTxt0 $task URL($hostport,robots) $buf
+       }
+       if {[info exists URL($hostport,robots)]} {
+           foreach l $URL($hostport,robots) {
+               if {[string first [lindex $l 1] $path] == 0} {
+                   set ok [lindex $l 0]
+                   break
+               }
+           }
+       }
+       if {!$ok} {
+           dbgmsg "skipped due to robots.txt"
+           return -1
+       }
+    }
+    if [catch {set sock [socket -async $host $port]}] {
+        return -1
+    }
+    RobotConnect $task $url $sock
+
+    return 0
+}
+
+proc loadlib {} {
+    global libdir
+
+    if {![llength [info commands htmlSwitch]]} {
+        if {[info exists env(tclrobot_lib)]} {
+           set d $env(tclrobot_lib)
+        } else {
+            if { $libdir > "" } {
+                set d $libdir
+            } else {
+               set d .
+            }
+        }
+        set e [info sharedlibextension]
+        dbgmsg "About to load $d/tclrobot$e"
+        if {[catch {load $d/tclrobot$e}]} {
+            dbgmsg "Didn't get at $d, trying directly"
+           load tclrobot$e
+        }
+        dbgmsg "Loaded tclrobot$e all right"
+    }
+}
+
+set agent "zmbot/0.2"
+if {![catch {set os [exec uname -s -r]}]} {
+    set agent "$agent ($os)"
+}
+
+dbgmsg "agent: $agent"
+
+proc bgerror {m} {
+    global errorInfo
+    dbgmsg "BGERROR $m"
+    dbgmsg $errorInfo
+}
+
+# Rules: allow, deny, url
+
+proc checkrule {task type this} {
+    global control
+    global debuglevel
+
+    set default_ret 1
+
+    if {$debuglevel > 3} {
+        dbgmsg "CHECKRULE $type $this"
+    }
+    if {[info exist control($task,alrules)]} {
+        foreach l $control($task,alrules) {
+            if {$debuglevel > 3} {
+                dbgmsg "consider $l"
+            }
+            # consider type
+            if {[lindex $l 1] != $type} continue
+            # consider mask (! negates)
+            set masks [lindex $l 2]
+           set ok 0
+           set default_ret 0
+           foreach mask $masks {       
+                if {$debuglevel > 4} {
+                    dbgmsg "consider single mask $mask"
+                }
+                if {[string index $mask 0] == "!"} {
+                    set mask [string range $mask 1 end]
+                    if {[string match $mask $this]}  continue
+                } else {
+                    if {![string match $mask $this]} continue
+                }
+                set ok 1
+            }
+            if {$debuglevel > 4} {
+                dbgmsg "ok = $ok"
+            }
+            if {!$ok} continue
+            # OK, we have a match
+            if {[lindex $l 0] == "allow"} {
+                if {$debuglevel > 3} {
+                    dbgmsg "CHECKRULE MATCH OK"
+                }
+                return 1
+            } else {
+                if {$debuglevel > 3} {
+                    dbgmsg "CHECKFULE MATCH FAIL"
+                }
+                return 0
+            }
+        }
+    }
+    if {$debuglevel > 3} {
+        dbgmsg "CHECKRULE MATCH DEFAULT $default_ret"
+    }
+    return $default_ret
+}
+
+
+proc url {href} {
+    global debuglevel task
+
+    if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+        if {![RobotFileExist $task visited $host $path]} {
+            set outf [RobotFileOpen $task unvisited $host $path]
+            RobotWriteRecord $outf href 0
+            RobotFileClose $outf
+        }
+    }
+}
+
+proc deny {type stuff} {
+    global control task
+
+    lappend control($task,alrules) [list deny $type $stuff]
+}
+
+proc allow {type stuff} {
+    global control task
+
+    lappend control($task,alrules) [list allow $type $stuff]
+}
+
+proc debug {level} {
+    global debuglevel
+
+    set debuglevel $level
+}
+
+proc CreateTask {t} {
+    global tasks task status control
+
+    set task $t
+
+    if {[info exists tasks]} {
+        if {[lsearch -exact $tasks $t] >= 0} {
+            return 0
+        }
+    }
+
+    lappend tasks $t
+    set status($t,unvisited) 0
+    set status($t,visited) 0
+    set status($t,bad) 0
+    set status($t,raw) 0
+    set status($t,active) 1
+    set control($t,seq) 0
+    set control($t,distance) 10
+    set control($t,target) tmp
+    set control($t,output) output
+    set control($t,cr) 0
+    return 1
+}
+
+# Little utility that ensures that at least one task is present (main).
+proc CreateMainTask {} {
+    global tasks
+    if {![info exist tasks]} {
+        CreateTask main
+    }
+}
+
+# Parse options
+
+set i 0
+set l [llength $argv]
+
+if {$l < 1} {
+    puts {tclrobot: usage:}
+    puts {tclrobot [-j jobs] [-p pid] [-T tmpdir] [-o logfile] [-i idle] [-c
+    count] [-d domain] [-D spooldir] [-r rules] [-L libdir] [url ..]}
+    logmsg " Example: -c 3 -d '*.dk' http://www.indexdata.dk/"
+
+    exit 1
+}
+
+
+while  {$i < $l} {
+    set arg [lindex $argv $i]
+    switch -glob -- $arg {
+       -o* {
+           set fname [string range $arg 2 end]
+           if {![string length $fname]} {
+               set fname [lindex $argv [incr i]]
+           }
+           set loghandle [open $fname a]
+           #dbgmsg "agent: $agent"
+           #dbgmsg "-o $fname"
+       }
+       -p* {
+           set pidfname [string range $arg 2 end]
+           if {![string length $pidfname]} {
+               set pidfname [lindex $argv [incr i]]
+           }
+           #dbgmsg "-p $pidfname"
+           if {[file exists $pidfname]} {
+               set pf [open $pidfname]
+               gets $pf oldpid
+               close $pf
+               logmsg "File $pidfname already exist. pid=$oldpid"
+               if {[file isdirectory /proc/$oldpid]} {
+                   logmsg "And it's apparently running. Exiting."
+                   exit 1
+               }
+           }
+           set pf [open $pidfname w]
+           puts $pf [pid]
+           close $pf
+       }
+       -T* {
+           set tmpdir [string range $arg 2 end]
+           if {![string length $tmpdir]} {
+               set tmpdir [lindex $argv [incr i]]
+           }
+            set control(tmpdir) $tmpdir
+       }
+       -L* {
+           set libdir [string range $arg 2 end]
+           if {![string length $libdir]} {
+               set libdir [lindex $argv [incr i]]
+           }
+       }
+        -t* {
+           set t [string range $arg 2 end]
+           if {![string length $t]} {
+               set t [lindex $argv [incr i]]
+           }
+            CreateTask $t
+       }
+        -D* {
+           set dir [string range $arg 2 end]
+           if {![string length $dir]} {
+               set dir [lindex $argv [incr i]]
+           }
+            lappend daemon_dir $dir
+        }
+       -j* {
+           set robotsMax [string range $arg 2 end]
+           if {![string length $robotsMax]} {
+               set robotsMax [lindex $argv [incr i]]
+           }
+       }
+       -c* {
+            CreateMainTask
+           set control($task,distance) [string range $arg 2 end]
+           if {![string length $control($task,distance)]} {
+               set control($task,distance) [lindex $argv [incr i]]
+           }
+       }
+       -d* {
+            CreateMainTask
+           set dom [string range $arg 2 end]
+           if {![string length $dom]} {
+               set dom [lindex $argv [incr i]]
+           }
+           lappend control($task,domains) $dom
+       }
+       -i* {
+           set idletime [string range $arg 2 end]
+           if {![string length $idletime]} {
+               set idletime [lindex $argv [incr i]]
+           }
+       }
+        -l* {
+            CreateMainTask
+           set acceptLanguage [string range $arg 2 end]
+           if {![string length $acceptLanguage]} {
+               set acceptLanguage [lindex $argv [incr i]]
+           }
+       }
+        -r* {
+            CreateMainTask
+           set rfile [string range $arg 2 end]
+           if {![string length $rfile]} {
+               set rfile [lindex $argv [incr i]]
+           }
+            catch {unset maxdistance}
+            source $rfile
+            if {[info exists maxdistance]} {
+                set control($task,distance) $maxdistance
+            }
+        }
+       default {
+            CreateMainTask
+           set href $arg
+           #dbgmsg "in default: arg= $arg !!!"
+        loadlib
+           if {[RobotHref $task http://www.indexdata.dk/ href host path]} {
+               if {![RobotFileExist $task visited $host $path]} {
+                   set outf [RobotFileOpen $task unvisited $host $path]
+                   RobotWriteRecord $outf href 0
+                   RobotFileClose $outf
+               }
+           }
+       }
+    }
+    incr i
+}
+
+
+dbgmsg "Parsed args, now loading"
+loadlib
+
+if {![info exist robotsMax]} {
+    set robotsMax 5
+}
+
+if {[info exist daemon_dir]} {
+    logmsg "Daemon mode"
+    RobotDaemonLoop
+} else {
+    foreach t $tasks {
+       logmsg "task $t"
+       logmsg "max distance=$control($t,distance)"
+       if {[info exists control($t,domains)]} {
+           logmsg "domains=$control($t,domains)"
+       }
+    }
+    logmsg "max jobs=$robotsMax"
+    
+    foreach t $tasks {
+       RobotStart $t
+    }
+    
+    while {$robotsRunning} {
+       vwait robotsRunning
+    }
+    
+    if {[info exists tasks]} {
+       foreach t $tasks {
+           set statusfile [open $t/status w]
+           puts $statusfile "$status($t,unvisited) $status($t,bad) $status($t,visited)"
+           close $statusfile
+       }
+    }
+}
+