2 # $Id: dcdot.tcl,v 1.5 2003/01/13 13:59:07 adam Exp $
# RobotTextHtml url -- collects regexp matches from the text held in b and
# appends them to URL($url,meta) and then to URL($url,title).
# NOTE(review): this listing omits lines; the assignments of b and of the
# pattern e are not visible here, so what each pattern matches is unconfirmed.
11 proc RobotTextHtml {url} {
# NOTE(review): the dollar sign makes unset receive the VALUE of
# URL($url,meta) as a variable name rather than the array element itself, and
# catch hides any resulting error. Presumably "unset URL($url,meta)" was
# meant -- confirm.
16 catch {unset $URL($url,meta)}
# Each pass stores the start and end index of the first match of e in i.
17 while {[regexp -nocase -indices $e $b i]} {
18 set meta [string range $b [lindex $i 0] [lindex $i 1]]
19 lappend URL($url,meta) $meta
# Rescan from the end index of the match; that index is inclusive, so the
# final matched character stays at the front of b.
20 set b [string range $b [lindex $i 1] end]
# NOTE(review): same unset form as above and it still names meta, although
# the loop below fills URL($url,title); a title list left over from an
# earlier call would not be cleared here -- confirm the intent.
24 catch {unset $URL($url,meta)}
25 while {[regexp -nocase -indices $e $b i]} {
26 set title [string range $b [lindex $i 0] [lindex $i 1]]
27 lappend URL($url,title) $title
28 set b [string range $b [lindex $i 1] end]
36 switch $URL($url,head,content-type) {
# RobotReadContent url sock -- reads the next chunk from sock and appends it
# to URL($url,buf). It is registered as a readable handler by
# RobotReadHeader.
# NOTE(review): lines are omitted from this listing; what happens when the
# read returns nothing is not visible here.
44 proc RobotReadContent {url sock} {
# Read at most 16384 characters from the socket in one call.
47 set buffer [read $sock 16384]
48 set readCount [string length $buffer]
# A zero-length read takes this branch; its body is not shown in the listing.
50 if {$readCount <= 0} {
55 # puts "Got $readCount bytes"
# Append the chunk to everything accumulated so far for this url.
56 set URL($url,buf) $URL($url,buf)$buffer
# RobotReadHeader url sock -- accumulates response data in URL($url,buf),
# looks for the blank line that ends the header block, stores each header as
# URL($url,head,<lower-case name>) and then registers RobotReadContent as the
# readable handler for the socket.
# NOTE(review): lines are omitted from this listing; the conditions guarding
# several of the statements below are not visible.
60 proc RobotReadHeader {url sock} {
# Read at most 2148 characters from the socket in one call.
63 set buffer [read $sock 2148]
64 set readCount [string length $buffer]
# A zero-length read takes this branch; its body is not shown in the listing.
66 if {$readCount <= 0} {
70 # puts "Got $readCount bytes"
71 set URL($url,buf) $URL($url,buf)$buffer
# Index of the first CR LF CR LF sequence in the accumulated buffer.
73 set n [string first \r\n\r\n $URL($url,buf)]
# Debug trace written to stdout.
75 puts "string first match n = $n"
# Header text is everything from index 0 through index n inclusive.
78 set headbuf [string range $URL($url,buf) 0 $n]
# NOTE(review): the remainder restarts at index n, so it still begins with
# the CR LF CR LF separator found above. If the body is meant to start after
# the blank line the offset would need to be n plus 4 -- confirm.
80 set URL($url,buf) [string range $URL($url,buf) $n end]
# Pull the HTTP version and status code out of the status line. The result
# of regexp is not checked on this line, so version and code are not
# assigned when the pattern fails to match.
82 regexp {^HTTP/([0-9.]+)[ ]+([0-9]+)} $headbuf x version code
83 set lines [split $headbuf \n]
# Header pattern: name is the text before the first colon; value stops at
# the first semicolon, so any parameters after a semicolon are dropped.
85 if {[regexp {^([^:]+):[ ]+([^;]*)} $line x name value]} {
86 set URL($url,head,[string tolower $name]) [string trim $value]
# NOTE(review): the condition under which the state becomes skip is not
# visible in this listing.
89 set URL($url,state) skip
# Default to an empty content type when the response carried none, so the
# switch below always has a value to test.
93 if {![info exists URL($url,head,content-type)]} {
94 set URL($url,head,content-type) {}
# Dispatch on the content type. Every arm visible in this listing registers
# RobotReadContent as the readable handler; the arm patterns are not shown.
96 switch $URL($url,head,content-type) {
98 fileevent $sock readable [list RobotReadContent $url $sock]
101 fileevent $sock readable [list RobotReadContent $url $sock]
104 puts "ok preceeed with this thingy"
105 fileevent $sock readable [list RobotReadContent $url $sock]
# RobotConnect url sock -- configures the socket, registers RobotReadHeader
# as its readable handler and writes the opening lines of an HTTP/1.0 GET
# request for URL($url,path).
# NOTE(review): lines are omitted from this listing; how agent and URL are
# brought into scope, and whether the request is terminated and flushed, are
# not visible here.
123 proc RobotConnect {url sock} {
# Non-blocking channel; input translation lf, output translation crlf, so
# each puts below ends its line with CR LF.
126 fconfigure $sock -translation {lf crlf} -blocking 0
127 fileevent $sock readable [list RobotReadHeader $url $sock]
128 puts $sock "GET $URL($url,path) HTTP/1.0"
129 puts $sock "Host: $URL($url,host)"
130 puts $sock "User-Agent: $agent"
# RobotGetUrl url phost -- splits url into method, host, port and path,
# records them in the URL array, opens an asynchronous socket to the host and
# hands it to RobotConnect.
# NOTE(review): lines are omitted from this listing; the failure branches,
# the fallback when no port is given, the return values and any use of phost
# are not visible here.
135 proc RobotGetUrl {url phost} {
# method is the text before "://", hostport runs up to the next slash and
# path is the following run of non-space characters.
137 if {![regexp {([^:]+)://([^/]+)([^ ]*)} $url x method hostport path]} {
# This branch is taken when hostport has no explicit numeric port; its body
# is not shown in the listing.
140 if {![regexp {([^:]+):([0-9]+)} $hostport x host port]} {
144 set URL($url,method) $method
145 set URL($url,host) $host
146 set URL($url,port) $port
147 set URL($url,path) $path
# Initial state for this url.
148 set URL($url,state) head
# socket -async returns before the connection is established; catch turns an
# immediate failure into a true condition here.
150 if [catch {set sock [socket -async $host $port]}] {
153 RobotConnect $url $sock
# Build the agent string. RobotConnect sends a variable named agent as the
# User-Agent header.
# NOTE(review): presumably these lines run at file level -- the enclosing
# scope is not visible in this listing.
158 set agent "dcdot.tcl/0.0"
# Append the output of uname -s -r when the command runs successfully; a
# failure of exec is ignored and the plain agent string is kept.
159 if {![catch {set os [exec uname -s -r]}]} {
160 set agent "$agent ($os)"
# RobotGetDCDOT url -- starts the fetch of url through RobotGetUrl and then
# loops for as long as robotMoreWork is true.
# NOTE(review): lines are omitted from this listing; the loop body and the
# place where robotMoreWork is assigned are not visible here.
163 proc RobotGetDCDOT {url} {
# NOTE(review): global takes variable names only, so this links
# robotMoreWork and also a variable literally named 1; it assigns nothing.
# Presumably "set robotMoreWork 1" follows on an omitted line -- confirm.
164 global robotMoreWork 1
# RobotGetUrl is called with an empty phost argument; the body of this
# branch is not shown in the listing.
167 if [RobotGetUrl $url {}] {
171 while {$robotMoreWork} {
# Driver fragment: takes the url from the first command-line argument, walks
# the collected meta, title and header entries, and writes the downloaded
# buffer to out.pdf.
# NOTE(review): lines are omitted from this listing; the call that starts
# the fetch and the bodies of the loops below are not visible here.
177 set url [lindex $argv 0]
# info exist is accepted by Tcl as an abbreviation of info exists.
180 if {[info exist URL($url,meta)]} {
181 foreach m $URL($url,meta) {
185 if {[info exist URL($url,title)]} {
186 foreach m $URL($url,title) {
# Visit every stored response header for this url.
190 foreach v [array names URL $url,head,*] {
193 puts "Buffer length is [string length $URL($url,buf)]"
# NOTE(review): the file is opened in plain w mode and no fconfigure call is
# visible in this listing, so the channel's default translation and encoding
# would apply to the written data -- confirm that this is safe for binary
# content such as a PDF. No close of f is visible either.
194 set f [open out.pdf w]
195 puts -nonewline $f $URL($url,buf)