--- /dev/null
+#!/usr/bin/tclsh
+# $Id: dcdot.tcl,v 1.1 2000/12/07 20:16:11 adam Exp $
+#
+
+proc RobotRestart {} {
+ # Signal the top-level vwait loop (see RobotGetDCDOT) that the current
+ # fetch is finished; no further work is scheduled in this script.
+ global robotMoreWork
+
+ set robotMoreWork 0
+}
+
+proc RobotTextHtml {url} {
+ # Parse the buffered HTML body of $url (URL($url,buf)) with the
+ # htmlSwitch extension command, filling in:
+ #   URL($url,title)                 - contents of <title>
+ #   URL($url,meta,<name>,<scheme>)  - one entry per <meta name=... content=...>
+ #   URL($url,links)                 - list of <a href=...> targets
+ # htmlSwitch provides $body (element content) and the parm() array
+ # (attributes) inside each handler block.
+ global URL
+
+ set head 0
+ htmlSwitch $URL($url,buf) \
+ title {
+ set URL($url,title) $body
+ } -nonest meta {
+ # scheme defaults to empty when the meta tag carries no scheme attr,
+ # so the array key is still well-formed.
+ set scheme {}
+ if {[info exist parm(scheme)]} {
+ set scheme $parm(scheme)
+ unset parm(scheme)
+ }
+ if {[info exist parm(name)]} {
+ if {[info exist parm(content)]} {
+ set URL($url,meta,$parm(name),$scheme) $parm(content)
+ unset parm(content)
+ }
+ unset parm(name)
+ }
+ } a {
+ if {[info exists parm(href)]} {
+ lappend URL($url,links) $parm(href)
+ }
+ }
+}
+
+proc Robot200 {url} {
+ # Handle a completed 200 response: dispatch on the Content-Type header
+ # captured by RobotReadHeader. Only text/html is parsed here; note that
+ # RobotReadHeader also streams text/plain bodies, but they are silently
+ # ignored by this switch (no default branch).
+ global URL domains
+
+ # puts "Parsing $url"
+ switch $URL($url,head,content-type) {
+ text/html {
+ RobotTextHtml $url
+ }
+ }
+ # puts "Parsing done"
+}
+
+proc RobotReadContent {url sock} {
+ # fileevent readable handler for the response body: accumulate chunks
+ # into URL($url,buf); on EOF (read returns no data on a non-blocking
+ # socket) close the socket, parse the page, and signal completion.
+ global URL
+
+ set buffer [read $sock 16384]
+ set readCount [string length $buffer]
+
+ if {$readCount <= 0} {
+ close $sock
+ Robot200 $url
+ RobotRestart
+ } else {
+ # puts "Got $readCount bytes"
+ set URL($url,buf) $URL($url,buf)$buffer
+ }
+}
+
+proc RobotReadHeader {url sock} {
+ # fileevent readable handler for the HTTP response header. Accumulates
+ # data until the blank line terminating the header is seen, then:
+ #  - extracts the HTTP status code,
+ #  - stores each "Name: value" header as URL($url,head,<lowercased name>),
+ #  - on 200 + text/html|text/plain, re-arms the fileevent with
+ #    RobotReadContent to stream the body; otherwise finishes the fetch.
+ global URL
+
+ # NOTE(review): 2148 looks like a typo for 2048 — harmless (just a read
+ # chunk size), left as-is to preserve behavior.
+ set buffer [read $sock 2148]
+ set readCount [string length $buffer]
+
+ if {$readCount <= 0} {
+ # EOF before a complete header: give up on this URL.
+ close $sock
+ RobotRestart
+ } else {
+ # puts "Got $readCount bytes"
+ set URL($url,buf) $URL($url,buf)$buffer
+
+ # The socket is configured with -translation auto, so CRLF pairs are
+ # already collapsed to \n and \n\n marks the end of the header.
+ set n [string first \n\n $URL($url,buf)]
+ if {$n > 1} {
+ set code 0
+ set version {}
+ set headbuf [string range $URL($url,buf) 0 $n]
+ incr n
+ incr n
+ set URL($url,buf) [string range $URL($url,buf) $n end]
+
+ regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
+ set lines [split $headbuf \n]
+ foreach line $lines {
+ if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
+ set URL($url,head,[string tolower $name]) $value
+ }
+ }
+ set URL($url,state) skip
+ switch $code {
+ 200 {
+ if {![info exists URL($url,head,content-type)]} {
+ set URL($url,head,content-type) {}
+ }
+ switch $URL($url,head,content-type) {
+ text/html {
+ fileevent $sock readable [list RobotReadContent $url $sock]
+ }
+ text/plain {
+ fileevent $sock readable [list RobotReadContent $url $sock]
+ }
+ default {
+ close $sock
+ Robot200 $url
+ RobotRestart
+ }
+ }
+ }
+ default {
+ # Bug fix: Robot404 is not defined anywhere in this script, so a
+ # non-200 response raised "invalid command name". Guard the call
+ # so it still runs if a Robot404 proc is provided externally, but
+ # no longer aborts the event loop when it is absent.
+ catch {Robot404 $url}
+ close $sock
+ RobotRestart
+ }
+ }
+ }
+ }
+}
+
+proc RobotConnect {url sock} {
+ # Issue an HTTP/1.0 GET on an (async-connecting) socket and arm the
+ # header reader. -translation {auto crlf}: input normalizes CRLF to \n
+ # (relied on by RobotReadHeader's \n\n scan), output emits CRLF as HTTP
+ # requires. -blocking 0 so reads/writes never stall the event loop.
+ global URL agent
+
+ fconfigure $sock -translation {auto crlf} -blocking 0
+ fileevent $sock readable [list RobotReadHeader $url $sock]
+ puts $sock "GET $URL($url,path) HTTP/1.0"
+ puts $sock "Host: $URL($url,host)"
+ puts $sock "User-Agent: $agent"
+ puts $sock ""
+ flush $sock
+}
+
+proc RobotGetUrl {url phost} {
+ # Split $url into method://host[:port]/path, record the pieces in the
+ # URL() array, open an asynchronous connection and start the request.
+ # $phost is unused here (kept for call compatibility).
+ # Returns 0 on success, -1 when the URL cannot be parsed or the socket
+ # cannot be created.
+ global URL
+ if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
+ return -1
+ }
+ # No explicit :port in the authority -> default to 80.
+ if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
+ set port 80
+ set host $hostport
+ }
+ set URL($url,method) $method
+ set URL($url,host) $host
+ set URL($url,port) $port
+ set URL($url,path) $path
+ set URL($url,state) head
+ set URL($url,buf) {}
+ # -async: connect in the background; RobotConnect's queued output is
+ # flushed once the connection completes (socket is non-blocking).
+ if [catch {set sock [socket -async $host $port]}] {
+ return -1
+ }
+ RobotConnect $url $sock
+
+ return 0
+}
+
+# Load the tclrobot extension (provides htmlSwitch) unless it is already
+# present: try the current directory first, then the library path.
+if {![llength [info commands htmlSwitch]]} {
+ set e [info sharedlibextension]
+ if {[catch {load ./tclrobot$e}]} {
+ load tclrobot$e
+ }
+}
+
+# User-Agent string sent with every request; append OS name/release when
+# uname is available.
+set agent "zmbot/0.0"
+if {![catch {set os [exec uname -s -r]}]} {
+ set agent "$agent ($os)"
+}
+
+proc RobotGetDCDOT {url} {
+ # Fetch $url and block in the event loop until the fetch completes
+ # (RobotRestart clears robotMoreWork).
+ # Bug fix: this read "global robotMoreWork 1" — the Tcl global command
+ # takes only variable names, so "1" declared a spurious global variable
+ # named "1" rather than initializing anything.
+ global robotMoreWork
+
+ set robotMoreWork 1
+ if [RobotGetUrl $url {}] {
+ # Could not even start the request: nothing to wait for.
+ set robotMoreWork 0
+ }
+
+ while {$robotMoreWork} {
+ vwait robotMoreWork
+ }
+}
+
+# Command-line entry point: dcdot.tcl <url>
+# Fetches the page, then prints every Dublin Core meta field collected by
+# RobotTextHtml (array keys of the form "<url>,meta,DC.<name>,<scheme>").
+if {$argc == 1} {
+ set url [lindex $argv 0]
+ RobotGetDCDOT $url
+ # Glob pattern over array names: matches ",meta,DC." / ",meta,dc." etc.
+ set mask {,meta,[Dd][Cc]\.*}
+ foreach a [array names URL $url$mask] {
+ puts "URL($a) = $URL($a)"
+ }
+}
\ No newline at end of file
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.5 1999/12/27 11:49:31 adam Exp $
+# $Id: robot.tcl,v 1.6 2000/12/07 20:16:11 adam Exp $
#
proc RobotFileNext {area} {
if {[catch {set ns [glob ${area}/*]}]} {
}
return http://[string range $n $off end]
}
- if {[file isdirectory $n]} {
+ }
+ foreach n $ns {
+ if {[file isdirectory $n]} {
set sb [RobotFileNext $n]
if {[string length $sb]} {
return $sb
}
}
-proc RobotFileOpen {area host path} {
+proc RobotFileClose {out} {
+ if [string compare $out stdout] {
+ close $out
+ }
+}
+
+proc RobotFileOpen {area host path {mode w}} {
set orgPwd [pwd]
global workdir
- #puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
+ if {![info exists workdir]} {
+ return stdout
+ }
+ puts "RobotFileOpen orgPwd=$orgPwd area=$area host=$host path=$path"
if {[string compare $orgPwd $workdir]} {
puts "workdir = $workdir"
puts "pwd = $orgPwd"
}
set d [lindex $comp $len]
if {[string length $d]} {
- set out [open $d w]
+ if {[file isdirectory $d]} {
+ set out [open $d/:.html $mode]
+ } else {
+ set out [open $d $mode]
+ }
} else {
- set out [open :.html w]
+ set out [open :.html $mode]
}
cd $orgPwd
#puts "RobotFileStop"
proc RobotRestart {} {
global URL
+ global robotMoreWork
while {1} {
set url [RobotFileNext unvisited]
if {![string length $url]} {
- puts "No more unvisited"
break
}
set r [RobotGetUrl $url {}]
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
}
}
- exit 0
+ set robotMoreWork 0
}
-proc headSave {url out title} {
+proc headSave {url out} {
global URL
- puts $out {<meta>}
- puts $out "<title>$title</title>"
+ puts $out {<zmbot>}
if {[info exists URL($url,head,last-modified)]} {
puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
}
upvar $hostx host
upvar $pathx path
- # puts "Ref url = $url href=$href"
+ puts "Ref url = $url href=$href"
# get method (if any)
if {![regexp {^([^/:]+):(.*)} $href x method hpath]} {
set hpath $href
}
}
# get host (if any)
- if {![regexp {^//([^/]+)(.*)} $hpath x host epath]} {
- set epath $hpath
- set host $URL($url,host)
- } else {
- if {![string length $epath]} {
- set epath /
+ if {[regexp {^//([^/]+)([^\#]*)} $hpath x host surl]} {
+ if {![string length $surl]} {
+ set surl /
}
set ok 0
foreach domain $domains {
if {!$ok} {
return 0
}
+ } else {
+ regexp {^([^\#]*)} $hpath x surl
+ set host $URL($url,host)
}
- if {[regexp {^(\#|\?)} $epath]} {
- # within page
+ if {![string length $surl]} {
return 0
- } elseif {![regexp {^([/][^\#?]*)} $epath x path]} {
+ }
+ if {[string first / $surl]} {
# relative path
- set ext [file extension $URL($url,path)]
- if {[string compare $ext {}]} {
- set dpart [file dirname $URL($url,path)]
+ regexp {^([^\#?]*)} $URL($url,path) x dpart
+ set l [string last / $dpart]
+ if {[expr $l >= 0]} {
+ set surl [string range $dpart 0 $l]$surl
} else {
- set dpart $URL($url,path)
+ set surl $dpart/$surl
}
- regexp {^([^\#?]+)} $epath x path
- set path [string trimright $dpart /]/$path
}
- set c [split $path /]
+ set c [split $surl /]
set i [llength $c]
incr i -1
set path [lindex $c $i]
}
}
set href "$method://$host$path"
- # puts "Ref href = $href"
+ puts "Ref href = $href"
return 1
}
proc Robot401 {url} {
global URL
- puts "Bad link $url"
+ puts "Bad URL $url"
+ set fromurl {}
+ catch {
+ set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
+ set fromurl [gets $inf]
+ close $inf
+ }
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
- if {![RobotFileExist forbidden $URL($url,host) $URL($url,path)]} {
- set outf [RobotFileOpen forbidden $URL($url,host) $URL($url,path)]
- close $outf
+ if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
+ puts $outf "URL=$url 401"
+ puts $outf "Reference $fromurl"
+ RobotFileClose $outf
}
}
proc Robot404 {url} {
global URL
- puts "Bad link $url"
+ puts "Bad URL $url"
+ set fromurl {}
+ catch {
+ set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
+ set fromurl [gets $inf]
+ RobotFileClose $inf
+ }
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
- close $outf
+ puts $outf "URL=$url 404"
+ puts $outf "Reference $fromurl"
+ RobotFileClose $outf
}
-}
+ }
proc Robot301 {url tourl} {
global URL
puts "Redirecting from $url to $tourl"
+
+ set fromurl {}
+ catch {
+ set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
+ set fromurl [gets $inf]
+ RobotFileClose $inf
+ }
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+ if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
+ set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
+ puts $outf "URL=$url to $tourl 301"
+ puts $outf "Reference $fromurl"
+ RobotFileClose $outf
+ }
if {[RobotHref $url tourl host path]} {
if {![RobotFileExist unvisited $host $path]} {
+ puts "Mark as unvisited"
set outf [RobotFileOpen unvisited $host $path]
- close $outf
+ puts $outf 301
+ RobotFileClose $outf
}
}
}
-proc Robot200 {url} {
- global URL domains
-
- # puts "Parsing $url"
- set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
- set ti 0
- if {[info exists URL($url,buf)]} {
- set htmlContent $URL($url,buf)
-
- htmlSwitch $htmlContent \
+proc RobotTextHtml {url out} {
+ global URL
+
+ set head 0
+ htmlSwitch $URL($url,buf) \
title {
- if {!$ti} {
- headSave $url $out $body
- set ti 1
+ if {!$head} {
+ headSave $url $out
+ set head 1
+ }
+ puts $out "<title>$body</title>"
+ } -nonest meta {
+ if {!$head} {
+ headSave $url $out
+ set head 1
+ }
+ puts -nonewline $out "<meta"
+ foreach a [array names parm] {
+ puts -nonewline $out " $a"
+ puts -nonewline $out {="}
+ puts -nonewline $out $parm($a)
+ puts -nonewline $out {"}
}
+ puts $out {></meta>}
} body {
regsub -all -nocase {<script.*</script>} $body {} abody
regsub -all {<[^\>]+>} $abody {} nbody
puts "no href"
continue
}
- if {!$ti} {
- headSave $url $out "untitled"
- set ti 1
+ if {!$head} {
+ headSave $url $out
+ set head 1
}
if {1} {
set href $parm(href)
puts $out "<identifier>$href</identifier>"
puts $out "<description>$body</description>"
puts $out "</cr>"
-
+
if {![RobotFileExist visited $host $path]} {
- if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
- puts "--- Error $msg"
- exit 1
+ if {![RobotFileExist bad $host $path]} {
+ if {[catch {set outf [RobotFileOpen unvisited $host $path]} msg]} {
+ puts "--- Error $msg"
+ exit 1
+ }
+ puts $outf $url
+ RobotFileClose $outf
}
- close $outf
}
}
- }
- }
- if {!$ti} {
- headSave $url $out "untitled"
- set ti 1
+ }
+ if {!$head} {
+ headSave $url $out
+ set head 1
}
+ puts $out "</zmbot>"
+}
+
+proc RobotTextPlain {url out} {
+ global URL
+
+ headSave $url $out
+ puts $out "<documentcontent>"
+ puts $out $URL($url,buf)
+ puts $out "</documentcontent>"
puts $out "</meta>"
- close $out
+}
+
+proc Robot200 {url} {
+ global URL domains
+
+ puts "Parsing $url"
+ set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
+ switch $URL($url,head,content-type) {
+ text/html {
+ RobotTextHtml $url $out
+ }
+ text/plain {
+ RobotTextPlain $url $out
+ }
+ default {
+ headSave $url $out
+ puts $out "</zmbot>"
+ }
+ }
+ RobotFileClose $out
# puts "Parsing done"
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
}
-proc RobotReadBody {url sock} {
+proc RobotReadContent {url sock} {
global URL
set buffer [read $sock 16384]
}
}
-proc RobotReadHead {url sock} {
+proc RobotReadHeader {url sock} {
global URL
- set buffer [read $sock 8192]
+ set buffer [read $sock 2148]
set readCount [string length $buffer]
if {$readCount <= 0} {
RobotRestart
}
200 {
- if {[info exists URL($url,head,content-type)]} {
- if {![string compare $URL($url,head,content-type) text/html]} {
- set URL($url,state) html
- }
+ if {![info exists URL($url,head,content-type)]} {
+ set URL($url,head,content-type) {}
}
- if {[string compare $URL($url,state) html]} {
- close $sock
- Robot200 $url
- RobotRestart
- } else {
- fileevent $sock readable [list RobotReadBody $url $sock]
+ switch $URL($url,head,content-type) {
+ text/html {
+ fileevent $sock readable [list RobotReadContent $url $sock]
+ }
+ text/plain {
+ fileevent $sock readable [list RobotReadContent $url $sock]
+ }
+ default {
+ close $sock
+ Robot200 $url
+ RobotRestart
+ }
}
}
default {
}
proc RobotConnect {url sock} {
- global URL
+ global URL agent
fconfigure $sock -translation {auto crlf} -blocking 0
puts "Reading $url"
- fileevent $sock readable [list RobotReadHead $url $sock]
+ fileevent $sock readable [list RobotReadHeader $url $sock]
puts $sock "GET $URL($url,path) HTTP/1.0"
puts $sock "Host: $URL($url,host)"
+ puts $sock "User-Agent: $agent"
puts $sock ""
flush $sock
}
}
}
-if {[llength $argv] < 2} {
- puts "Tclrobot: usage <domain> <start>"
- puts " Example: '*.dk' www.indexdata.dk"
+
+set agent "zmbot/0.0"
+if {![catch {set os [exec uname -s -r]}]} {
+ set agent "$agent ($os)"
+ puts "agent: $agent"
+}
+
+proc bgerror {m} {
+ puts "BGERROR $m"
+}
+
+if {0} {
+ proc RobotRestart {} {
+ global robotMoreWork
+ set robotMoreWork 0
+ puts "myrestart"
+ }
+ set robotMoreWork 1
+ set url {http://www.indexdata.dk/zap/}
+ RobotGetUrl $url {}
+ while {$robotMoreWork} {
+ vwait robotMoreWork
+ }
+ puts "-----------"
+ puts $URL($url,buf)
+ puts "-----------"
exit 1
}
+set robotMoreWork 0
set workdir [pwd]
+if {[llength $argv] < 2} {
+ puts "Tclrobot: usage <domain> <start>"
+ puts " Example: '*.indexdata.dk' http://www.indexdata.dk/"
+ exit 1
+}
+
set domains [lindex $argv 0]
set site [lindex $argv 1]
if {[string length $site]} {
- set x [RobotFileOpen unvisited $site /]
- close $x
+ set robotMoreWork 1
+ if [RobotGetUrl $site {}] {
+ set robotMoreWork 0
+ puts "Couldn't process $site"
+ } else {
+ #set x [RobotFileOpen unvisited $site /robots.txt]
+ #RobotFileClose $x
+ }
}
-
-RobotRestart
-vwait forever
+while {$robotMoreWork} {
+ vwait robotMoreWork
+}