Index: openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl,v
diff -u -N -r1.67.2.21 -r1.67.2.22
--- openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 12 Jan 2017 20:00:23 -0000 1.67.2.21
+++ openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 15 Jan 2017 19:18:53 -0000 1.67.2.22
@@ -869,8 +869,114 @@
return {}
}
- ad_proc ad_sanitize_html {
+ package require struct
+ package require htmlparse
+
+ ad_proc ad_dom_fix_html {
-html:required
+ } {
+
+ Similar in spirit to the famous Tidy command line utility,
+ this proc takes a piece of possibly invalid markup and returns
+ a 'fixed' version where unopened tags have been closed and
+ attribute specifications have been normalized by transforming them
+ in the form attribute-name="attribute value"
. All
+ attributes with an invalid (non-alphanumeric) name will be
+ stripped.
+
+ Be aware that every comment and also the possibly present
+ DOCTYPE declaration will be stripped from the markup. Also,
+ most of tag's internal whitespace will be trimmed. This
+ behavior comes from the htmlparse library used in this
+ implementation.
+
+ @author Antonio Pisano
+
+ } {
+ set tree [::struct::tree]
+
+
+ catch {::htmlparse::tags destroy}
+
+ ::struct::stack ::htmlparse::tags
+ ::htmlparse::tags push root
+ $tree set root type root
+
+ ::htmlparse::parse \
+ -cmd [list ::htmlparse::2treeCallback $tree] \
+ -incvar errs $html
+
+ $tree walk root -order post n {
+ ::htmlparse::Reorder $tree $n
+ }
+
+ ::htmlparse::tags destroy
+
+
+ set marker root
+ set lmarker "<$marker>"
+ set rmarker "$marker>"
+ dom createDocument $marker doc
+ set root [$doc documentElement]
+
+ set queue {}
+ lappend queue [list $root [$tree children [$tree children root]]]
+ while {$queue ne {}} {
+ lassign [lindex $queue 0] domparent treechildren
+ set queue [lrange $queue 1 end]
+
+ foreach child $treechildren {
+ set type [$tree get $child type]
+ set data [$tree get $child data]
+ if {$type eq "PCDATA"} {
+ set el [$doc createTextNode $data]
+ } else {
+ set el [$doc createElement $type]
+
+ # parse element attributes
+ while {$data ne ""} {
+ set data [string trim $data]
+ # attribute with a value, optionally surrounded by double or single quotes
+ if {[regexp "^(\[^= \]+)=(\"\[^\"\]*\"|'\[^'\].*'|\[^ \]*)" $data m attname attvalue]} {
+ if {[string match "\"*\"" $attvalue] ||
+ [string match "'*'" $attvalue]} {
+ set attvalue [string range $attvalue 1 end-1]
+ }
+ # attribute with no value
+ } elseif {[regexp {^([^\s]+)} $data m attname]} {
+ set attvalue ""
+ } else {
+ error "Unrecoverable attribute spec in supplied markup"
+ }
+
+ # skip bogus attribute names
+ if {[string is alnum -strict $attname]} {
+ $el setAttribute $attname $attvalue
+ }
+
+ set data [string range $data [string length $m] end]
+ }
+ }
+
+ $domparent appendChild $el
+
+ set elchildren [$tree children $child]
+ if {$elchildren ne {}} {
+ lappend queue [list $el $elchildren]
+ }
+ }
+ }
+
+ $tree destroy
+
+ set html [$doc asHTML]
+ set html [string range $html [string length $lmarker] end-[string length $rmarker]]
+
+ return [string trim $html]
+ }
+
+ ad_proc ad_dom_sanitize_html {
+ -html:required
-allowed_tags
-allowed_attributes
-allowed_protocols
@@ -880,6 +986,7 @@
-no_js:boolean
-no_outer_urls:boolean
-validate:boolean
+ -fix:boolean
} {
Sanitizes HTML by specified criteria, basically removing
@@ -921,6 +1028,13 @@
stripped markup and just report whether the original one
respects all the specified requirements.
+ @param fix When parsing fails on markup as it is, try to fix
+ it by, for example, closing unclosed tags or normalizing
+ attribute specification. This operation will remove most of
+ plain whitespace into text content of original HTML, toghether
+ with every comment and the eventually present DOCTYPE
+ declaration.
+
@return sanitized markup or a (0/1) truth value when the
-validate flag is specified
@@ -1031,24 +1145,50 @@
# wrapping html in an auxiliary root element
set lmarker ""
set rmarker ""
- set html "${lmarker}${html}${rmarker}"
- if {[catch {dom parse -html $html doc} errmsg]} {
- ad_log error "Failed at parsing HTML. Error from tDOM is: $errmsg"
- return [expr {$validate_p ? 0 : ""}]
+ set html "${lmarker}${html}${rmarker}"
+
+ if {[catch {
+ dom parse -html $html doc
+ } errmsg]} {
+ if {!$fix_p ||
+ [catch {
+ set html [ad_fix_html -html $html]
+ dom parse -html $html doc
+ } errmsg]} {
+ ad_log error "Parsing of the document failed. Reported error: $errmsg"
+ return [expr {$validate_p ? 0 : ""}]
+ }
}
+
$doc documentElement root
set driver_info [util_driver_info]
set driver_prot [dict get $driver_info proto]
set driver_host [dict get $driver_info hostname]
set driver_port [dict get $driver_info port]
- set system_url [util::join_location \
- -proto $driver_prot \
- -hostname $driver_host \
- -port $driver_port]
- # protocol-relative version of the system url
- regsub ^$driver_prot:// $system_url {//} system_url_noprot
+ ## create a regex clause of possible addresses referring to
+ ## this system
+ set our_locations {}
+
+ # location from conf files
+ set location [util::join_location \
+ -proto $driver_prot \
+ -hostname $driver_host \
+ -port $driver_port]
+ set our_location($location) 1
+ regsub {^\w+://} $location {//} location
+ set our_location($location) 1
+
+ # location from connection
+ set location [ad_conn location]
+ set our_location($location) 1
+ regsub {^\w+://} $location {//} location
+ set our_location($location) 1
+
+ set our_locations [join [array names our_location] |]
+ ##
+
set queue [$root childNodes]
while {$queue ne {}} {
set node [lindex $queue 0]
@@ -1088,17 +1228,18 @@
set prot ""
- # attribute is a URL as per RFC
- if {[util::split_location $url prot hostname port]} {
+ # attribute is a full URL
+ if {[regexp {^(\w+:)?//(.*)} $url match prot loc]} {
if {$no_outer_urls_p} {
- # no external urls allowed: we still want
- # to allow fully specified urls that refer
- # to this server, but we'll transform them
- # in a local absolute reference. For all
- # others, attribute will be removed
- # altogether.
+ # no external urls allowed: we still
+ # want to allow fully specified urls
+ # that refer to this server, but we'll
+ # transform them in a local absolute
+ # reference. For all others, attribute
+ # will be just removed.
# - This is ok, points to our system...
- if {[regsub ^($system_url|$system_url_noprot) $url {} url]} {
+ if {[regsub ^($our_locations) $url {} url]} {
+ set url /[string trimleft $url "/"]
$node setAttribute $att $url
# ...this is not, points elsewhere!
} else {
Index: openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl,v
diff -u -N -r1.1.2.1 -r1.1.2.2
--- openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl 12 Jan 2017 20:00:23 -0000 1.1.2.1
+++ openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl 15 Jan 2017 19:18:54 -0000 1.1.2.2
@@ -6,47 +6,62 @@
}
-aa_register_case -cats {api smoke} ad_sanitize_html {
+aa_register_case -cats {api smoke} ad_dom_sanitize_html {
Test if it HTML sanitization works as expected
} {
# - Weird HTML, nonexistent and unclosed tags, '<' and '>' chars:
# result should be ok, with '<' and '>' converted to entities
- lappend test_msgs "Invalid markup with single '<' and '>' chars ok?"
+ lappend test_msgs "Test case 1: invalid markup with single '<' and '>' chars ok"
lappend test_cases {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
- lappend test_result_trivial {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
- lappend test_result_no_js {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
- lappend test_result_no_outer_urls {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
+ lappend test_results_trivial {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
+ lappend test_results_no_js {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
+ lappend test_results_no_outer_urls {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
+ lappend test_results_fixing_markup {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA}
# - Weird HTML, nonexistent and unclosed tags, MULTIPLE '<' and '>' chars:
# some loss in translation, multiple '<' and '>' become single ones
- lappend test_msgs "Invalid markup with multiple '<' and '>' chars ok?"
+ lappend test_msgs "Test case 2: invalid markup with multiple '<' and '>' chars ok"
lappend test_cases {
sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad <<<<<<<<<< a <<< a << <<< << sasasadsasa < sadASDSA
}
- lappend test_result_trivial {
+ lappend test_results_trivial {
sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA
}
- lappend test_result_no_js {
+ lappend test_results_no_js {
sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA
}
- lappend test_result_no_outer_urls {
+ lappend test_results_no_outer_urls {
sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA
}
+ lappend test_results_fixing_markup {
+ sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA
+ }
# - Half opened HTML into other markup: this markup will be completely rejected
- lappend test_msgs "Invalid unparseable markup ok?"
+ lappend test_msgs "Test case 3: invalid unparseable markup ok"
lappend test_cases {
sadsa dfsdafs 3 sadASDSA
}
- lappend test_result_trivial {}
- lappend test_result_no_js {}
- lappend test_result_no_outer_urls {}
+ lappend test_results_trivial {}
+ lappend test_results_no_js {}
+ lappend test_results_no_outer_urls {}
+ lappend test_results_fixing_markup {}
+ # - Formally invalid HTML: this markup will be rejected when the
+ # fix option is not enabled and parsed otherwise. Internal
+ # blank space into tags will be lost.
+ lappend test_msgs "Test case 4: formally invalid markup ok"
+ lappend test_cases {