#!/usr/bin/tclsh
-# $Id: dcdot.tcl,v 1.3 2000/12/08 22:46:53 adam Exp $
+# $Id: dcdot.tcl,v 1.4 2000/12/11 17:11:03 adam Exp $
#
proc RobotRestart {} {
# puts "Got $readCount bytes"
set URL($url,buf) $URL($url,buf)$buffer
- set n [string first \n\n $URL($url,buf)]
+ set n [string first \r\n\r\n $URL($url,buf)]
if {$n > 1} {
+ puts "string first match n = $n"
set code 0
set version {}
set headbuf [string range $URL($url,buf) 0 $n]
- incr n
- incr n
+ incr n 4
set URL($url,buf) [string range $URL($url,buf) $n end]
regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
set lines [split $headbuf \n]
foreach line $lines {
if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
- set URL($url,head,[string tolower $name]) $value
+ set URL($url,head,[string tolower $name]) [string trim $value]
}
}
set URL($url,state) skip
text/plain {
fileevent $sock readable [list RobotReadContent $url $sock]
}
+ application/pdf {
+ puts "ok preceeed with this thingy"
+ fileevent $sock readable [list RobotReadContent $url $sock]
+ }
default {
close $sock
Robot200 $url
# NOTE(review): CVS-diff residue, not plain Tcl. "-" lines are the old
# revision, "+" lines the new one; original indentation was lost in
# extraction. Comments below annotate only what the visible change shows.
proc RobotConnect {url sock} {
global URL agent
# Changed socket translation: old {auto crlf} let Tcl normalize incoming
# line endings; new {lf crlf} reads input as raw LF so the \r bytes of
# HTTP headers survive in the buffer (the header parser elsewhere in this
# diff now searches for \r\n\r\n instead of \n\n).
- fconfigure $sock -translation {auto crlf} -blocking 0
+ fconfigure $sock -translation {lf crlf} -blocking 0
fileevent $sock readable [list RobotReadHeader $url $sock]
# Minimal HTTP/1.0 request: request line plus Host header.
puts $sock "GET $URL($url,path) HTTP/1.0"
puts $sock "Host: $URL($url,host)"
# Dump every stored response-header entry for this URL (debug output).
foreach v [array names URL $url,head,*] {
puts "$v = $URL($v)"
}
# New in this revision: writes the accumulated body buffer to out.pdf --
# looks like temporary PDF-retrieval debugging; TODO confirm it is meant
# to stay (it will clobber out.pdf on every call).
+ puts "Buffer length is [string length $URL($url,buf)]"
+ set f [open out.pdf w]
+ puts -nonewline $f $URL($url,buf)
+ close $f
}
#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.8 2000/12/10 22:27:48 adam Exp $
+# $Id: robot.tcl,v 1.9 2000/12/11 17:11:03 adam Exp $
#
-proc RobotFileNext1 {area} {
+proc RobotFileNext1 {area lead} {
+ puts "RobotFileNext1 area=$area lead=$lead"
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
- set off [string first / $area]
- incr off
-
foreach n $ns {
if {[file isfile $n]} {
- if {[string first :.html $n] > 0} {
- return http://[string range $area/ $off end]
- }
- return http://[string range $n $off end]
+ set off [string last / $n]
+ incr off 2
+ return $lead/[string range $n $off end]
}
}
foreach n $ns {
if {[file isdirectory $n]} {
- set sb [RobotFileNext1 $n]
+ set off [string last / $n]
+ incr off 2
+ set sb [RobotFileNext1 $n $lead/[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
# NOTE(review): CVS-diff hunk ("-" = old, "+" = new revision); indentation
# lost in extraction. Returns the next URL to visit from the on-disk queue
# under $area, or {} when nothing is left.
proc RobotFileNext {area} {
global robotSeq
+ puts "RobotFileNext robotSeq=$robotSeq"
# No queue directory (or empty glob) -> nothing to do.
if {[catch {set ns [glob ${area}/*]}]} {
return {}
}
set n [lindex $ns $robotSeq]
if {![string length $n]} {
+ flush stdout
puts "------------ N E X T R O U N D --------"
# Restart the scan from the beginning after a timed wait; the wait was
# doubled from 30s to 60s in this revision.
set robotSeq -1
- after 30000 RobotFileWait
+ after 60000 RobotFileWait
vwait robotSeq
set n [lindex $ns $robotSeq]
if {![string length $n]} {
+ puts "robotSeq = $robotSeq"
+ puts "ns=$ns"
+ puts "no more work at index"
return {}
}
}
incr robotSeq
# New on-disk naming scheme: the per-host robots marker is now stored as
# "frobots.txt" (plain files get an "f" prefix elsewhere in this diff).
# NOTE(review): $off is referenced below but never set in this visible
# span -- presumably assigned in a part of the proc the diff omits; verify.
- if {[file isfile $n/robots.txt]} {
+ if {[file isfile $n/frobots.txt]} {
puts "ok returning http://[string range $n $off end]/robots.txt"
return http://[string range $n $off end]/robots.txt
} elseif {[file isdirectory $n]} {
# RobotFileNext1 gained a second "lead" argument (the URL prefix) in
# this revision -- see its own hunk.
- set sb [RobotFileNext1 $n]
+ set sb [RobotFileNext1 $n http://[string range $n $off end]]
if {[string length $sb]} {
return $sb
}
}
+ puts "no more work at end of RobotFileNext n=$n"
+ puts "ns=$ns"
return {}
}
# NOTE(review): CVS-diff hunk ("-" = old, "+" = new revision); indentation
# lost in extraction. Tests whether the on-disk record for host/path exists
# under $area. The revision replaces the old ":.html" placeholder naming
# with a scheme that prefixes directories with "d" and files with "f",
# avoiding the old trailing-slash special cases.
proc RobotFileExist {area host path} {
- set comp [split $area/$host$path /]
- set l [llength $comp]
+ puts "RobotFileExist begin"
+ puts "area=$area host=$host path=$path"
+ set lpath [split $path /]
+ set l [llength $lpath]
# $l now indexes the last path component (the file part).
incr l -1
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
- } elseif {[file exists [join $comp /]]} {
- return 1
- } else {
- set comp [split $area/$host$path/:.html /]
- }
- return [file exists [join $comp /]]
# New mapping: each intermediate component is joined with "/d" (directory
# prefix) and the final component gets an "f" prefix (file prefix), e.g.
# area/host/da/db/fpage -- presumably to keep file and directory names
# from colliding; TODO confirm against RobotFileOpen's matching hunk.
+ set t [lindex $lpath $l]
+ incr l -1
+ set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ puts "npath=$npath"
+ puts "RobotFileExist end"
+ return [file exists $npath]
}
# NOTE(review): CVS-diff hunk ("-" = old, "+" = new revision); indentation
# lost in extraction. Removes the on-disk record for host/path under $area,
# then prunes now-empty parent directories. Rewritten to build the new
# "d"-prefixed-directory / "f"-prefixed-file path (must mirror
# RobotFileExist's mapping exactly -- verify the two hunks stay in sync).
proc RobotFileUnlink {area host path} {
- set comp [split $area/$host$path /]
+ puts "RobotFileUnlink begin"
+ puts "area=$area host=$host path=$path"
+ set lpath [split $path /]
+ set l [llength $lpath]
+ incr l -1
+ set t [lindex $lpath $l]
+ incr l -1
# Same npath construction as RobotFileExist: intermediate components
# joined with "/d", final component prefixed with "f".
+ set npath $area/$host[join [lrange $lpath 0 $l] /d]/f$t
+ puts "npath=$npath"
+ set comp [split $npath /]
set l [llength $comp]
incr l -1
# Old ":.html" fallback for directory-style paths is gone in the new
# naming scheme.
- if {![string length [lindex $comp $l]]} {
- set comp [split $area/$host$path:.html /]
- }
# Delete via exec rm; a failure (already gone) just returns quietly.
if {[catch {exec rm [join $comp /]}]} return
incr l -1
# Walk back up, removing directories until one is non-empty.
# NOTE(review): the loop globs/rmdirs $path, not the component being
# walked -- looks suspicious (pre-existing in the old revision); confirm.
for {set i $l} {$i > 0} {incr i -1} {
if {![catch {glob $path/*}]} return
exec rmdir ./$path
}
+ puts "RobotFileUnlink end"
}
proc RobotFileClose {out} {
set len [llength $comp]
incr len -1
for {set i 0} {$i < $len} {incr i} {
- set d [lindex $comp $i]
+ if {$i > 1} {
+ set d "d[lindex $comp $i]"
+ } else {
+ set d [lindex $comp $i]
+ }
if {[catch {cd ./$d}]} {
exec mkdir $d
+ puts "creating $d"
cd ./$d
if {![string compare $area unvisited] && $i == 1 && $mode == "w"} {
- set out [open robots.txt w]
+ set out [open frobots.txt w]
puts "creating robots.txt in $d"
close $out
}
set d [lindex $comp $len]
if {[string length $d]} {
if {[file isdirectory $d]} {
- set out [open $d/:.html $mode]
+ set out [open $d/f $mode]
+ puts "1"
} else {
- set out [open $d $mode]
+ set out [open f$d $mode]
+ puts "2"
}
} else {
- set out [open :.html $mode]
+ set out [open f $mode]
+ puts "3"
}
cd $orgPwd
#puts "RobotFileStop"
proc headSave {url out} {
global URL
- puts $out {<zmbot>}
if {[info exists URL($url,head,last-modified)]} {
puts $out "<lastmodified>$URL($url,head,last-modified)</lastmodified>"
}
switch -- [lindex $c $i] {
.. {
incr i -2
+ if {$i < 0} {
+ set i 0
+ }
}
. {
incr i -1
puts "Redirecting from $url to $tourl"
set fromurl {}
- catch {
+ if {[RobotFileExist unvisited $URL($url,host) $URL($url,path)]} {
set inf [RobotFileOpen unvisited $URL($url,host) $URL($url,path) r]
set fromurl [gets $inf]
RobotFileClose $inf
}
- RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
+ if {[catch {RobotFileUnlink unvisited $URL($url,host) $URL($url,path)}]} {
+ puts "unlink failed"
+ exit 1
+ }
if {![RobotFileExist bad $URL($url,host) $URL($url,path)]} {
set outf [RobotFileOpen bad $URL($url,host) $URL($url,path)]
puts $outf "URL=$url to $tourl $code"
proc RobotTextHtml {url out} {
global URL
- set head 0
htmlSwitch $URL($url,buf) \
title {
- if {!$head} {
- headSave $url $out
- set head 1
- }
puts $out "<title>$body</title>"
} -nonest meta {
- if {!$head} {
- headSave $url $out
- set head 1
- }
puts -nonewline $out "<meta"
foreach a [array names parm] {
puts -nonewline $out " $a"
puts "no href"
continue
}
- if {!$head} {
- headSave $url $out
- set head 1
- }
if {1} {
set href $parm(href)
if {![RobotHref $url href host path]} continue
}
}
}
- if {!$head} {
- headSave $url $out
- set head 1
- }
- puts $out "</zmbot>"
}
proc RobotsTxt {url} {
proc RobotTextPlain {url out} {
global URL
- headSave $url $out
puts $out "<documentcontent>"
puts $out $URL($url,buf)
puts $out "</documentcontent>"
- puts $out "</meta>"
if {![string compare $URL($url,path) /robots.txt]} {
RobotsTxt $url
puts "Parsing $url"
set out [RobotFileOpen visited $URL($url,host) $URL($url,path)]
+ puts $out "<zmbot>"
+ headSave $url $out
switch $URL($url,head,content-type) {
text/html {
RobotTextHtml $url $out
text/plain {
RobotTextPlain $url $out
}
- default {
- headSave $url $out
- puts $out "</zmbot>"
+ application/pdf {
+ set pdff [open test.pdf w]
+ puts -nonewline $pdff $URL($url,buf)
+ close $pdff
}
}
+ puts $out "</zmbot>"
RobotFileClose $out
# puts "Parsing done"
RobotFileUnlink unvisited $URL($url,host) $URL($url,path)
}
-proc RobotReadContent {url sock} {
+proc RobotReadContent {url sock binary} {
global URL
set buffer [read $sock 16384]
if {$readCount <= 0} {
Robot200 $url
RobotRestart $sock
- } elseif {[string first \0 $buffer] >= 0} {
+ } elseif {!$binary && [string first \0 $buffer] >= 0} {
Robot200 $url
RobotRestart $sock
} else {
proc RobotReadHeader {url sock} {
global URL
+ puts "RobotReadHeader $url"
if {[catch {set buffer [read $sock 2148]}]} {
RobotError $url 404
RobotRestart $sock
# puts "Got $readCount bytes"
set URL($url,buf) $URL($url,buf)$buffer
- set n [string first \n\n $URL($url,buf)]
+ set n [string first \r\n\r\n $URL($url,buf)]
if {$n > 1} {
set code 0
set version {}
set headbuf [string range $URL($url,buf) 0 $n]
- incr n
- incr n
+ incr n 4
set URL($url,buf) [string range $URL($url,buf) $n end]
regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
set lines [split $headbuf \n]
foreach line $lines {
if {[regexp {^([^:]+):[ ]+(.*)} $line x name value]} {
- set URL($url,head,[string tolower $name]) $value
+ set URL($url,head,[string tolower $name]) [string trim $value]
}
}
puts "code = $code"
RobotRedirect $url $URL($url,head,location) 302
RobotRestart $sock
}
- 404 {
- RobotError $url 404
- RobotRestart $sock
- }
- 401 {
- RobotError $url 401
- RobotRestart $sock
- }
200 {
if {![info exists URL($url,head,content-type)]} {
set URL($url,head,content-type) {}
}
+ set binary 0
switch $URL($url,head,content-type) {
- text/html {
- fileevent $sock readable [list RobotReadContent $url $sock]
- }
- text/plain {
- fileevent $sock readable [list RobotReadContent $url $sock]
- }
- default {
- Robot200 $url
- RobotRestart $sock
+ application/pdf {
+ set binary 1
}
}
+ fileevent $sock readable [list RobotReadContent $url $sock $binary]
}
default {
- RobotError $url 404
+ RobotError $url $code
RobotRestart $sock
}
}
proc RobotConnect {url sock} {
global URL agent
- fconfigure $sock -translation {auto crlf} -blocking 0
+ fconfigure $sock -translation {lf crlf} -blocking 0
fileevent $sock readable [list RobotReadHeader $url $sock]
puts $sock "GET $URL($url,path) HTTP/1.0"
puts $sock "Host: $URL($url,host)"
proc RobotGetUrl {url phost} {
global URL
+ flush stdout
puts "---------"
puts $url
if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {