#!/usr/bin/tclsh
-# $Id: robot.tcl,v 1.29 2001/11/14 09:15:23 adam Exp $
+# $Id: robot.tcl,v 1.30 2002/02/17 09:29:18 adam Exp $
#
proc RobotFileNext1 {area lead} {
# puts "RobotFileNext1 area=$area lead=$lead"
proc RobotTextHtml {url out} {
global URL maxdistance
+ # set title so we can emit it for the body
+ set title {}
+ # if true, nothing will be indexed
+ set noindex 0
+ # if true, nothing will be followed
+ set nofollow 0
+
set distance 0
set fdistance 0
if {$maxdistance < 1000 && [info exists URL($url,dist)]} {
}
htmlSwitch $URL($url,buf) \
title {
- puts $out "<title>$body</title>"
+ set title $body
} -nonest meta {
+ # collect metadata and save NAME= CONTENT=..
+ set metaname {}
+ set metacontent {}
puts -nonewline $out "<meta"
foreach a [array names parm] {
- puts -nonewline $out " $a"
+ set al [string tolower $a]
+ puts -nonewline $out " $al"
puts -nonewline $out {="}
puts -nonewline $out $parm($a)
puts -nonewline $out {"}
+ switch -- $al {
+ "name" {
+ set metaname [string tolower $parm($a)]
+ }
+ "content" {
+ set metacontent $parm($a)
+ }
+ }
+ }
+ puts $out "></meta>"
+ # go through robots directives (if any)
+ if {![string compare $metaname robots]} {
+ set direcs [split [string tolower $metacontent] ,]
+ if {[lsearch $direcs noindex] >= 0} {
+ set noindex 1
+ }
+ if {[lsearch $direcs nofollow] >= 0} {
+ set nofollow 1
+ }
}
- puts $out {></meta>}
} body {
- regsub -all {<!--[^-]*->} $body { } abody
- regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
- regsub -all {<[^\>]+>} $bbody {} nbody
- puts $out "<documentcontent>"
- puts $out $nbody
- puts $out "</documentcontent>"
+ # don't print title or document content if noindex is used
+ if {!$noindex} {
+ puts $out "<title>$title</title>"
+ regsub -all {<!--[^-]*->} $body { } abody
+ regsub -all -nocase {<script[^<]*</script>} $abody {} bbody
+ regsub -all {<[^\>]+>} $bbody {} nbody
+ puts $out "<documentcontent>"
+ puts $out $nbody
+ puts $out "</documentcontent>"
+ }
} -nonest base {
+ # <base href=.. >
if {![info exists parm(href)]} {
continue
}
set href [string trim $parm(href)]
if {![RobotHref $url href host path]} continue
set URL($url,bpath) $path
- } -nonest a {
+ } a {
+ # <a href="...."> .. </a>
+ # we're not using nonest - otherwise body isn't set
+ if {$nofollow} continue
if {![info exists parm(href)]} {
continue
}
link $url $out [string trim $parm(href)] $body $distance
} -nonest area {
+ if {$nofollow} continue
if {![info exists parm(href)]} {
continue
}