#!/usr/bin/env tclsh
# $Id: link-checker.xotcl,v 1.5 2006/09/27 08:12:39 neumann Exp $
# -gn july 2000
package require XOTcl 1; namespace import -force xotcl::*
@ @File {
description {
A simple link checking program that checks in parallel
pages of a site.
Options:
-url | Start-URL |
-foreign | 0 or 1, specifies, whether foreign links of
local pages should be checked (default 1) |
-local | A string match pattern to decide
which url should be treated as local
e.g. -local *wu-wien.ac.at/*
Per default the locality filter ist set
to the name of the host followed by '/*' |
-restrict | 0 or 1, sets the locality filter to the subtree
implied by the URL |
-verbose | 0 or 1 or 2, verbosity level (default 0) |
}
}
if {$tcl_version<8.2} {
puts stderr "This script requires Tcl 8.2 or newer"
exit -1
}
set opt(-url) http://localhost:8000/
set opt(-url) http://nm.wu-wien.ac.at/Lehre/
set opt(-verbose) 0; # 0, 1 (show check), or 2 (show ignore)
set opt(-foreign) 1; # 0, 1 (check foreign links on local pages)
set opt(-restrict) 0; # 0, 1
## per default, lc checks the
array set opt $argv
if {$opt(-restrict)} {
regexp {://(.*)$} $opt(-url) _ opt(-local)
set opt(-local) [string trimright $opt(-local) /]*
puts stderr "locality filter set to '$opt(-local)'"
}
if {![info exists opt(-local)]} {
regexp {http://([^/:]+)} $opt(-url) _ opt(-local)
append opt(-local) /*
puts stderr "locality filter set to '$opt(-local)'"
}
#package require xotcl::package; package verbose 1
package require xotcl::comm::httpAccess
package require xotcl::trace
proc printError {m} {}
Class Checker -superclass ParallelSink \
-parameter {verbose foreign local}
Checker array set ref {A HREF IMG SRC}
Checker set ref_re {[[:space:]]*=[[:space:]]*([[:graph:]]+)}
Checker instproc report {msg {level 1}} {
my instvar verbose
if {$verbose>$level} {puts stderr $msg}
return 0
}
Checker instproc isLocal {url} {
my instvar local
string match *://$local $url
}
Checker instproc isToCheck {url request methodvar} {
my instvar foreign
upvar $methodvar method
if {![regexp -nocase {http://([^/:]+)} $url _ host]} {
return [my report "ignored, no http: $url"]
}
set method GET
if {![my isLocal $url]} {
if {$foreign} {
#puts stderr "parenturl: [$request set parentUrl] -> [my isLocal [$request set parentUrl]]"
if {[$request info vars parentUrl] ne "" &&
![my isLocal [$request set parentUrl]]} {
return [my report "ignored, nor local: $url"]
} else {
set method HEAD
}
} else {
return [my report "ignored, nor local: $url"]
}
}
if {[regexp -nocase {[.](gif|jpg|ps|pdf|gz)$} $url]} {
set method HEAD
#return [my report "ignored due to extension: $url"]
}
return 1
}
Checker instproc checkLink {request link} {
set resolved [resolve $link [$request set url]]
if {[my isToCheck $resolved $request method]} {
my instvar checked
if {![info exists checked($resolved)]} {
my report "checking .......... $resolved" 0
set checked($resolved) 1
my scheduleRequest $method $resolved [$request set url]
} else {
#puts stderr "already checked $resolved"
}
}
}
Checker instproc checkText {request} {
if {![my isLocal [$request set url]]} return
[self class] instvar ref ref_re
set content [$request getContent]
set start 0
while {[regexp -nocase -indices -start $start -- \
{<(A|IMG)([^>]*?)} $content a b c]} {
set elem [string toupper \
[string range $content [lindex $b 0] [lindex $b 1]]]
set attribs [string range $content [lindex $c 0] [lindex $c 1]]
#regsub -all {[\n ]+} $attribs " " attribs
if {[regexp -nocase $ref($elem)$ref_re $attribs _ i]} {
my checkLink $request [string trim $i '\"]
}
set start [lindex $c 1]
}
}
Checker instproc endCb r {
#showObj $r
switch [$r set contentType] {
text/html {my checkText $r}
}
next
}
Checker instproc cancelCb r {
#$r showVars
puts stderr "ERROR in page [$r set parentUrl]"
puts stderr " Link: [$r set url]"
puts stderr " cause [$r set errormsg]\n"
next
}
Checker csink \
-verbose $opt(-verbose) -foreign $opt(-foreign) -local $opt(-local) \
-sinkClass MemorySink -httpVersion 1.0 -maxsimultaneous 30
csink requests $opt(-url)
puts stderr "sumbytes: [csink set sumbytes] requests: [csink set numrequests]"
csink destroy