Index: openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl,v diff -u -N -r1.67.2.21 -r1.67.2.22 --- openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 12 Jan 2017 20:00:23 -0000 1.67.2.21 +++ openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl 15 Jan 2017 19:18:53 -0000 1.67.2.22 @@ -869,8 +869,114 @@ return {} } - ad_proc ad_sanitize_html { + package require struct + package require htmlparse + + ad_proc ad_dom_fix_html { -html:required + } { + + Similar in spirit to the famous Tidy command line utility, + this proc takes a piece of possibly invalid markup and returns + a 'fixed' version where unopened tags have been closed and + attribute specifications have been normalized by transforming them + in the form attribute-name="attribute value". All + attributes with an invalid (non-alphanumeric) name will be + stripped.
+
+ Be aware that every comment and also the possibly present + DOCTYPE declaration will be stripped from the markup. Also, + most of tag's internal whitespace will be trimmed. This + behavior comes from the htmlparse library used in this + implementation. + + @author Antonio Pisano + + } { + set tree [::struct::tree] + + + catch {::htmlparse::tags destroy} + + ::struct::stack ::htmlparse::tags + ::htmlparse::tags push root + $tree set root type root + + ::htmlparse::parse \ + -cmd [list ::htmlparse::2treeCallback $tree] \ + -incvar errs $html + + $tree walk root -order post n { + ::htmlparse::Reorder $tree $n + } + + ::htmlparse::tags destroy + + + set marker root + set lmarker "<$marker>" + set rmarker "" + dom createDocument $marker doc + set root [$doc documentElement] + + set queue {} + lappend queue [list $root [$tree children [$tree children root]]] + while {$queue ne {}} { + lassign [lindex $queue 0] domparent treechildren + set queue [lrange $queue 1 end] + + foreach child $treechildren { + set type [$tree get $child type] + set data [$tree get $child data] + if {$type eq "PCDATA"} { + set el [$doc createTextNode $data] + } else { + set el [$doc createElement $type] + + # parse element attributes + while {$data ne ""} { + set data [string trim $data] + # attribute with a value, optionally surrounded by double or single quotes + if {[regexp "^(\[^= \]+)=(\"\[^\"\]*\"|'\[^'\].*'|\[^ \]*)" $data m attname attvalue]} { + if {[string match "\"*\"" $attvalue] || + [string match "'*'" $attvalue]} { + set attvalue [string range $attvalue 1 end-1] + } + # attribute with no value + } elseif {[regexp {^([^\s]+)} $data m attname]} { + set attvalue "" + } else { + error "Unrecoverable attribute spec in supplied markup" + } + + # skip bogus attribute names + if {[string is alnum -strict $attname]} { + $el setAttribute $attname $attvalue + } + + set data [string range $data [string length $m] end] + } + } + + $domparent appendChild $el + + set elchildren [$tree children $child] + if {$elchildren ne {}} { + lappend queue [list $el $elchildren] + } + } + } + + $tree destroy + + set html [$doc asHTML] + set html [string range $html [string length $lmarker] end-[string length $rmarker]] + + return [string trim $html] + } + + ad_proc ad_dom_sanitize_html { + -html:required -allowed_tags -allowed_attributes -allowed_protocols @@ -880,6 +986,7 @@ -no_js:boolean -no_outer_urls:boolean -validate:boolean + -fix:boolean } { Sanitizes HTML by specified criteria, basically removing @@ -921,6 +1028,13 @@ stripped markup and just report whether the original one respects all the specified requirements. + @param fix When parsing fails on markup as it is, try to fix + it by, for example, closing unclosed tags or normalizing + attribute specification. This operation will remove most of + plain whitespace into text content of original HTML, toghether + with every comment and the eventually present DOCTYPE + declaration. + @return sanitized markup or a (0/1) truth value when the -validate flag is specified @@ -1031,24 +1145,50 @@ # wrapping html in an auxiliary root element set lmarker "" set rmarker "" - set html "${lmarker}${html}${rmarker}" - if {[catch {dom parse -html $html doc} errmsg]} { - ad_log error "Failed at parsing HTML. Error from tDOM is: $errmsg" - return [expr {$validate_p ? 0 : ""}] + set html "${lmarker}${html}${rmarker}" + + if {[catch { + dom parse -html $html doc + } errmsg]} { + if {!$fix_p || + [catch { + set html [ad_fix_html -html $html] + dom parse -html $html doc + } errmsg]} { + ad_log error "Parsing of the document failed. Reported error: $errmsg" + return [expr {$validate_p ? 0 : ""}] + } } + $doc documentElement root set driver_info [util_driver_info] set driver_prot [dict get $driver_info proto] set driver_host [dict get $driver_info hostname] set driver_port [dict get $driver_info port] - set system_url [util::join_location \ - -proto $driver_prot \ - -hostname $driver_host \ - -port $driver_port] - # protocol-relative version of the system url - regsub ^$driver_prot:// $system_url {//} system_url_noprot + ## create a regex clause of possible addresses referring to + ## this system + set our_locations {} + + # location from conf files + set location [util::join_location \ + -proto $driver_prot \ + -hostname $driver_host \ + -port $driver_port] + set our_location($location) 1 + regsub {^\w+://} $location {//} location + set our_location($location) 1 + + # location from connection + set location [ad_conn location] + set our_location($location) 1 + regsub {^\w+://} $location {//} location + set our_location($location) 1 + + set our_locations [join [array names our_location] |] + ## + set queue [$root childNodes] while {$queue ne {}} { set node [lindex $queue 0] @@ -1088,17 +1228,18 @@ set prot "" - # attribute is a URL as per RFC - if {[util::split_location $url prot hostname port]} { + # attribute is a full URL + if {[regexp {^(\w+:)?//(.*)} $url match prot loc]} { if {$no_outer_urls_p} { - # no external urls allowed: we still want - # to allow fully specified urls that refer - # to this server, but we'll transform them - # in a local absolute reference. For all - # others, attribute will be removed - # altogether. + # no external urls allowed: we still + # want to allow fully specified urls + # that refer to this server, but we'll + # transform them in a local absolute + # reference. For all others, attribute + # will be just removed. # - This is ok, points to our system... - if {[regsub ^($system_url|$system_url_noprot) $url {} url]} { + if {[regsub ^($our_locations) $url {} url]} { + set url /[string trimleft $url "/"] $node setAttribute $att $url # ...this is not, points elsewhere! } else { Index: openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl,v diff -u -N -r1.1.2.1 -r1.1.2.2 --- openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl 12 Jan 2017 20:00:23 -0000 1.1.2.1 +++ openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl 15 Jan 2017 19:18:54 -0000 1.1.2.2 @@ -6,47 +6,62 @@ } -aa_register_case -cats {api smoke} ad_sanitize_html { +aa_register_case -cats {api smoke} ad_dom_sanitize_html { Test if it HTML sanitization works as expected } { # - Weird HTML, nonexistent and unclosed tags, '<' and '>' chars: # result should be ok, with '<' and '>' converted to entities - lappend test_msgs "Invalid markup with single '<' and '>' chars ok?" + lappend test_msgs "Test case 1: invalid markup with single '<' and '>' chars ok" lappend test_cases {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} - lappend test_result_trivial {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} - lappend test_result_no_js {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} - lappend test_result_no_outer_urls {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + lappend test_results_trivial {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + lappend test_results_no_js {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + lappend test_results_no_outer_urls {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} + lappend test_results_fixing_markup {sadsa dfsdafs 3 > 2 dfsdfasdfsdfsad sasasadsasa < sadASDSA} # - Weird HTML, nonexistent and unclosed tags, MULTIPLE '<' and '>' chars: # some loss in translation, multiple '<' and '>' become single ones - lappend test_msgs "Invalid markup with multiple '<' and '>' chars ok?" + lappend test_msgs "Test case 2: invalid markup with multiple '<' and '>' chars ok" lappend test_cases { sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad <<<<<<<<<< a <<< a << <<< << sasasadsasa < sadASDSA } - lappend test_result_trivial { + lappend test_results_trivial { sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA } - lappend test_result_no_js { + lappend test_results_no_js { sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA } - lappend test_result_no_outer_urls { + lappend test_results_no_outer_urls { sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA } + lappend test_results_fixing_markup { + sadsa dfsdafs 3 < 2 dfsdfasdfsdfsad < a < a < sasasadsasa < sadASDSA + } # - Half opened HTML into other markup: this markup will be completely rejected - lappend test_msgs "Invalid unparseable markup ok?" + lappend test_msgs "Test case 3: invalid unparseable markup ok" lappend test_cases { sadsa dfsdafs 3 sadASDSA } - lappend test_result_trivial {} - lappend test_result_no_js {} - lappend test_result_no_outer_urls {} + lappend test_results_trivial {} + lappend test_results_no_js {} + lappend test_results_no_outer_urls {} + lappend test_results_fixing_markup {} + # - Formally invalid HTML: this markup will be rejected when the + # fix option is not enabled and parsed otherwise. Internal + # blank space into tags will be lost. + lappend test_msgs "Test case 4: formally invalid markup ok" + lappend test_cases {
fooo } + lappend test_results_trivial {} + lappend test_results_no_js {} + lappend test_results_no_outer_urls {} + lappend test_results_fixing_markup "
fooo\n
" + # - Plain text: this should stay as it is - lappend test_msgs "Plain text ok?" + lappend test_msgs "Test case 5: plain text ok" set test_case { Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna @@ -58,49 +73,133 @@ deserunt mollit anim id est laborum. } lappend test_cases $test_case - lappend test_result_trivial $test_case - lappend test_result_no_js $test_case - lappend test_result_no_outer_urls $test_case + lappend test_results_trivial $test_case + lappend test_results_no_js $test_case + lappend test_results_no_outer_urls $test_case + lappend test_results_fixing_markup $test_case - foreach msg $test_msgs test_case $test_cases result_trivial $test_result_trivial result_no_js $test_result_no_js result_no_outer_urls $test_result_no_outer_urls { - set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols *] - set result [string trim $result] ; set result_trivial [string trim $result_trivial] - aa_true $msg [expr {$result eq $result_trivial}] - set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_js] - set result [string trim $result] ; set result_no_js [string trim $result_no_js] - aa_true $msg [expr {$result eq $result_no_js}] - set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls] - set result [string trim $result] ; set result_no_outer_urls [string trim $result_no_outer_urls] - aa_true $msg [expr {$result eq $result_no_outer_urls}] - } + # Try test cases allowing all kind of markup + foreach \ + msg $test_msgs \ + test_case $test_cases \ + test_result $test_results_trivial { + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols *] + set result [string trim $result] + set test_result [string trim $test_result] + aa_true "$msg trivial?" [expr {$result eq $test_result}] + } + # Try test cases not allowing js + foreach \ + msg $test_msgs \ + test_case $test_cases \ + test_result $test_results_no_js { + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -no_js] + set result [string trim $result] + set test_result [string trim $test_result] + aa_true "$msg no js?" [expr {$result eq $test_result}] + } + + # Try test cases not allowing outer urls + foreach \ + msg $test_msgs \ + test_case $test_cases \ + test_result $test_results_no_outer_urls { + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -no_outer_urls] + set result [string trim $result] + set test_result [string trim $test_result] + aa_true "$msg no outer urls?" [expr {$result eq $test_result}] + } + + # Try test cases fixing markup + foreach \ + msg $test_msgs \ + test_case $test_cases \ + test_result $test_results_fixing_markup { + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -fix] + set result [string trim $result] + set test_result [string trim $test_result] + aa_true "$msg fixing markup?" [expr {$result eq $test_result}] + } + array set r [util::http::get -url [util::configured_location]] set test_case $r(page) - set msg "In our index page is removing tags ok" + set msg "Test case 6: in our index page is removing tags ok" set unallowed_tags {div style script} - set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_tags $unallowed_tags] - set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_tags $unallowed_tags -validate] + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -unallowed_tags $unallowed_tags] + set valid_p [ad_dom_sanitize_html -html $result \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -unallowed_tags $unallowed_tags \ + -validate] aa_true "$msg with validate?" $valid_p aa_false $msg? [regexp {<(div|style|script)\s*[^>]*>} $result] set msg "In our index page is removing attributes ok" set unallowed_attributes {id style} - set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_attributes $unallowed_attributes] - set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_attributes $unallowed_attributes -validate] + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -unallowed_attributes $unallowed_attributes] + set valid_p [ad_dom_sanitize_html -html $result \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -unallowed_attributes $unallowed_attributes \ + -validate] aa_true "$msg with validate?" $valid_p aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(id|style)=".*"[^>]*>} $result] - set msg "In our index page is removing protocols ok?" + set msg "In our index page is removing protocols ok" set unallowed_protocols {http javascript https} - set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_protocols $unallowed_protocols] - set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_protocols $unallowed_protocols -validate] + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -unallowed_protocols $unallowed_protocols] + set valid_p [ad_dom_sanitize_html -html $result \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -unallowed_protocols $unallowed_protocols \ + -validate] aa_true "$msg with validate?" $valid_p aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(href|src|content|action)="(http|javascript):.*"[^>]*>} $result] - set msg "In our index page is removing outer links ok?" - set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls] - set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls -validate] + set msg "In our index page is removing outer links ok" + set result [ad_dom_sanitize_html -html $test_case \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -no_outer_urls] + set valid_p [ad_dom_sanitize_html -html $result \ + -allowed_tags * \ + -allowed_attributes * \ + -allowed_protocols * \ + -no_outer_urls \ + -validate] aa_true "$msg with validate?" $valid_p aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(href|src|content|action)="(http|https|//):.*"[^>]*>} $result]