Index: openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl,v
diff -u -N -r1.67.2.21 -r1.67.2.22
--- openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl	12 Jan 2017 20:00:23 -0000	1.67.2.21
+++ openacs-4/packages/acs-tcl/tcl/text-html-procs.tcl	15 Jan 2017 19:18:53 -0000	1.67.2.22
@@ -869,8 +869,114 @@
         return {}
     }
 
-    ad_proc ad_sanitize_html {
+    package require struct
+    package require htmlparse    
+    
+    ad_proc ad_dom_fix_html {
         -html:required
+    } {
+        
+        Similar in spirit to the famous Tidy command line utility,
+        this proc takes a piece of possibly invalid markup and returns
+        a 'fixed' version where unopened tags have been closed and
+        attribute specifications have been normalized by transforming them
+        in the form <code>attribute-name="attribute value"</code>. All
+        attributes with an invalid (non-alphanumeric) name will be
+        stripped.<br>
+        <br>        
+        Be aware that every comment and also the possibly present
+        DOCTYPE declaration will be stripped from the markup. Also,
+        most of tag's internal whitespace will be trimmed. This
+        behavior comes from the htmlparse library used in this
+        implementation.
+
+        @author Antonio Pisano
+        
+    } {
+        set tree [::struct::tree]
+
+        
+        catch {::htmlparse::tags destroy}
+
+        ::struct::stack ::htmlparse::tags
+        ::htmlparse::tags push root
+        $tree set root type root
+
+        ::htmlparse::parse \
+            -cmd [list ::htmlparse::2treeCallback $tree] \
+            -incvar errs $html
+
+        $tree walk root -order post n {
+            ::htmlparse::Reorder $tree $n
+        }
+
+        ::htmlparse::tags destroy
+        
+
+        set marker root
+        set lmarker "<$marker>"
+        set rmarker "</$marker>"
+        dom createDocument $marker doc
+        set root [$doc documentElement]
+        
+        set queue {}
+        lappend queue [list $root [$tree children [$tree children root]]]
+        while {$queue ne {}} {
+            lassign [lindex $queue 0] domparent treechildren            
+            set queue [lrange $queue 1 end]
+
+            foreach child $treechildren {
+                set type [$tree get $child type]
+                set data [$tree get $child data]
+                if {$type eq "PCDATA"} {
+                    set el [$doc createTextNode $data]
+                } else {
+                    set el [$doc createElement $type]
+
+                    # parse element attributes
+                    while {$data ne ""} {
+                        set data [string trim $data]
+                        # attribute with a value, optionally surrounded by double or single quotes
+                        if {[regexp "^(\[^= \]+)=(\"\[^\"\]*\"|'\[^'\].*'|\[^ \]*)" $data m attname attvalue]} {
+                            if {[string match "\"*\"" $attvalue] ||
+                                [string match "'*'" $attvalue]} {
+                                set attvalue [string range $attvalue 1 end-1]
+                            }
+                        # attribute with no value
+                        } elseif {[regexp {^([^\s]+)} $data m attname]} {
+                            set attvalue ""
+                        } else {
+                            error "Unrecoverable attribute spec in supplied markup"
+                        }
+
+                        # skip bogus attribute names
+                        if {[string is alnum -strict $attname]} {
+                            $el setAttribute $attname $attvalue
+                        }
+                        
+                        set data [string range $data [string length $m] end]
+                    }
+                }
+
+                $domparent appendChild $el
+
+                set elchildren [$tree children $child]
+                if {$elchildren ne {}} {
+                    lappend queue [list $el $elchildren]
+                }
+            }
+        }
+
+        $tree destroy
+
+        set html [$doc asHTML]
+        set html [string range $html [string length $lmarker] end-[string length $rmarker]]
+
+        return [string trim $html]
+    }
+
+    ad_proc ad_dom_sanitize_html {
+        -html:required
         -allowed_tags
         -allowed_attributes
         -allowed_protocols
@@ -880,6 +986,7 @@
         -no_js:boolean
         -no_outer_urls:boolean
         -validate:boolean
+        -fix:boolean
     } {
 
         Sanitizes HTML by specified criteria, basically removing
@@ -921,6 +1028,13 @@
         stripped markup and just report whether the original one
         respects all the specified requirements.
 
+        @param fix When parsing fails on markup as it is, try to fix
+        it by, for example, closing unclosed tags or normalizing
+        attribute specification. This operation will remove most of
+        plain whitespace into text content of original HTML, toghether
+        with every comment and the eventually present DOCTYPE
+        declaration.
+
         @return sanitized markup or a (0/1) truth value when the
         -validate flag is specified
 
@@ -1031,24 +1145,50 @@
         # wrapping html in an auxiliary root element
         set lmarker "<root>"
         set rmarker "</root>"
-        set html "${lmarker}${html}${rmarker}"        
-        if {[catch {dom parse -html $html doc} errmsg]} {
-            ad_log error "Failed at parsing HTML. Error from tDOM is: $errmsg"
-            return [expr {$validate_p ? 0 : ""}]
+        set html "${lmarker}${html}${rmarker}"
+        
+        if {[catch {
+            dom parse -html $html doc
+        } errmsg]} {
+            if {!$fix_p ||
+                [catch {
+                    set html [ad_fix_html -html $html]
+                    dom parse -html $html doc
+                } errmsg]} {
+                ad_log error "Parsing of the document failed. Reported error: $errmsg"
+                return [expr {$validate_p ? 0 : ""}]
+            }
         }
+        
         $doc documentElement root
         
         set driver_info [util_driver_info]
         set driver_prot [dict get $driver_info proto]
         set driver_host [dict get $driver_info hostname]
         set driver_port [dict get $driver_info port]
-        set system_url [util::join_location \
-                            -proto    $driver_prot \
-                            -hostname $driver_host \
-                            -port     $driver_port]
-        # protocol-relative version of the system url
-        regsub ^$driver_prot:// $system_url {//} system_url_noprot
 
+        ## create a regex clause of possible addresses referring to
+        ## this system
+        set our_locations {}
+
+        # location from conf files
+        set location [util::join_location \
+                          -proto    $driver_prot \
+                          -hostname $driver_host \
+                          -port     $driver_port]
+        set our_location($location) 1
+        regsub {^\w+://} $location {//} location
+        set our_location($location) 1
+
+        # location from connection
+        set location [ad_conn location]
+        set our_location($location) 1                
+        regsub {^\w+://} $location {//} location
+        set our_location($location) 1
+        
+        set our_locations [join [array names our_location] |]
+        ##
+
         set queue [$root childNodes]        
         while {$queue ne {}} {
             set node [lindex $queue 0]
@@ -1088,17 +1228,18 @@
                         
                         set prot ""
                         
-                        # attribute is a URL as per RFC
-                        if {[util::split_location $url prot hostname port]} {
+                        # attribute is a full URL
+                        if {[regexp {^(\w+:)?//(.*)} $url match prot loc]} {
                             if {$no_outer_urls_p} {
-                                # no external urls allowed: we still want
-                                # to allow fully specified urls that refer
-                                # to this server, but we'll transform them
-                                # in a local absolute reference. For all
-                                # others, attribute will be removed
-                                # altogether.
+                                # no external urls allowed: we still
+                                # want to allow fully specified urls
+                                # that refer to this server, but we'll
+                                # transform them in a local absolute
+                                # reference. For all others, attribute
+                                # will be just removed.                                
                                 # - This is ok, points to our system...
-                                if {[regsub ^($system_url|$system_url_noprot) $url {} url]} {
+                                if {[regsub ^($our_locations) $url {} url]} {
+                                    set url /[string trimleft $url "/"]
                                     $node setAttribute $att $url
                                 # ...this is not, points elsewhere!
                                 } else {
Index: openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl
===================================================================
RCS file: /usr/local/cvsroot/openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl,v
diff -u -N -r1.1.2.1 -r1.1.2.2
--- openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl	12 Jan 2017 20:00:23 -0000	1.1.2.1
+++ openacs-4/packages/acs-tcl/tcl/test/text-html-procs.tcl	15 Jan 2017 19:18:54 -0000	1.1.2.2
@@ -6,47 +6,62 @@
 }
 
 
-aa_register_case -cats {api smoke} ad_sanitize_html {
+aa_register_case -cats {api smoke} ad_dom_sanitize_html {
 
     Test if it HTML sanitization works as expected
 
 } {
 
     # - Weird HTML, nonexistent and unclosed tags, '<' and '>' chars:
     #   result should be ok, with '<' and '>' converted to entities
-    lappend test_msgs "Invalid markup with single '<' and '>' chars ok?"
+    lappend test_msgs "Test case 1: invalid markup with single '<' and '>' chars ok"
     lappend test_cases {<noexist>sadsa</noexist> dfsdafs <a> 3 > 2 dfsdfasdfsdfsad  sasasadsasa <    sadASDSA}
-    lappend test_result_trivial {<noexist>sadsa</noexist> dfsdafs <a> 3 &gt; 2 dfsdfasdfsdfsad  sasasadsasa &lt;    sadASDSA</a>}
-    lappend test_result_no_js {<noexist>sadsa</noexist> dfsdafs <a> 3 &gt; 2 dfsdfasdfsdfsad  sasasadsasa &lt;    sadASDSA</a>}
-    lappend test_result_no_outer_urls {<noexist>sadsa</noexist> dfsdafs <a> 3 &gt; 2 dfsdfasdfsdfsad  sasasadsasa &lt;    sadASDSA</a>}
+    lappend test_results_trivial {<noexist>sadsa</noexist> dfsdafs <a> 3 &gt; 2 dfsdfasdfsdfsad  sasasadsasa &lt;    sadASDSA</a>}
+    lappend test_results_no_js {<noexist>sadsa</noexist> dfsdafs <a> 3 &gt; 2 dfsdfasdfsdfsad  sasasadsasa &lt;    sadASDSA</a>}
+    lappend test_results_no_outer_urls {<noexist>sadsa</noexist> dfsdafs <a> 3 &gt; 2 dfsdfasdfsdfsad  sasasadsasa &lt;    sadASDSA</a>}
+    lappend test_results_fixing_markup {<noexist>sadsa</noexist> dfsdafs <a> 3 &gt; 2 dfsdfasdfsdfsad  sasasadsasa &lt;    sadASDSA</a>}
 
     # - Weird HTML, nonexistent and unclosed tags, MULTIPLE '<' and '>' chars:
     #   some loss in translation, multiple '<' and '>' become single ones
-    lappend test_msgs "Invalid markup with multiple '<' and '>' chars ok?"
+    lappend test_msgs "Test case 2: invalid markup with multiple '<' and '>' chars ok"
     lappend test_cases {
         <noexist>sadsa</noexist> dfsdafs <a></a> 3 < 2 dfsdfasdfsdfsad <<<<<<<<<< a <<< a << <<< << sasasadsasa <    sadASDSA
     }
-    lappend test_result_trivial {
+    lappend test_results_trivial {
         <noexist>sadsa</noexist> dfsdafs <a></a> 3 &lt; 2 dfsdfasdfsdfsad &lt; a &lt; a &lt; sasasadsasa &lt;    sadASDSA
     }
-    lappend test_result_no_js {
+    lappend test_results_no_js {
         <noexist>sadsa</noexist> dfsdafs <a></a> 3 &lt; 2 dfsdfasdfsdfsad &lt; a &lt; a &lt; sasasadsasa &lt;    sadASDSA
     }
-    lappend test_result_no_outer_urls {
+    lappend test_results_no_outer_urls {
         <noexist>sadsa</noexist> dfsdafs <a></a> 3 &lt; 2 dfsdfasdfsdfsad &lt; a &lt; a &lt; sasasadsasa &lt;    sadASDSA
     }
+    lappend test_results_fixing_markup {
+        <noexist>sadsa</noexist> dfsdafs <a></a> 3 &lt; 2 dfsdfasdfsdfsad &lt; a &lt; a &lt; sasasadsasa &lt;    sadASDSA
+    }
 
     # - Half opened HTML into other markup: this markup will be completely rejected
-    lappend test_msgs "Invalid unparseable markup ok?"
+    lappend test_msgs "Test case 3: invalid unparseable markup ok"
     lappend test_cases {
         <noexist>sadsa</noexist> dfsdafs <a><tag</a> 3 sadASDSA
     }
-    lappend test_result_trivial {}
-    lappend test_result_no_js {}
-    lappend test_result_no_outer_urls {}
+    lappend test_results_trivial {}
+    lappend test_results_no_js {}
+    lappend test_results_no_outer_urls {}
+    lappend test_results_fixing_markup {}
 
+    # - Formally invalid HTML: this markup will be rejected when the
+    #   fix option is not enabled and parsed otherwise. Internal
+    #   blank space into tags will be lost.
+    lappend test_msgs "Test case 4: formally invalid markup ok"
+    lappend test_cases {<div a %%> fooo <a>}
+    lappend test_results_trivial {}
+    lappend test_results_no_js {}
+    lappend test_results_no_outer_urls {}
+    lappend test_results_fixing_markup "<div a=\"\">fooo<a></a>\n</div>"
+
     # - Plain text: this should stay as it is
-    lappend test_msgs "Plain text ok?"
+    lappend test_msgs "Test case 5: plain text ok"
     set test_case {
         Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed
         do eiusmod tempor incididunt ut labore et dolore magna
@@ -58,49 +73,133 @@
         deserunt mollit anim id est laborum.
     }
     lappend test_cases $test_case
-    lappend test_result_trivial $test_case
-    lappend test_result_no_js $test_case
-    lappend test_result_no_outer_urls $test_case
+    lappend test_results_trivial $test_case
+    lappend test_results_no_js $test_case
+    lappend test_results_no_outer_urls $test_case
+    lappend test_results_fixing_markup $test_case    
 
-    foreach msg $test_msgs test_case $test_cases result_trivial $test_result_trivial result_no_js $test_result_no_js result_no_outer_urls $test_result_no_outer_urls {
-        set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols *]
-        set result [string trim $result] ; set result_trivial [string trim $result_trivial]
-        aa_true $msg [expr {$result eq $result_trivial}]
-        set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_js]
-        set result [string trim $result] ; set result_no_js [string trim $result_no_js]
-        aa_true $msg [expr {$result eq $result_no_js}]
-        set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls]
-        set result [string trim $result] ; set result_no_outer_urls [string trim $result_no_outer_urls]
-        aa_true $msg [expr {$result eq $result_no_outer_urls}]
-    }
+    # Try test cases allowing all kind of markup
+    foreach \
+        msg          $test_msgs \
+        test_case    $test_cases \
+        test_result  $test_results_trivial {
+            set result [ad_dom_sanitize_html -html $test_case \
+                            -allowed_tags * \
+                            -allowed_attributes * \
+                            -allowed_protocols *]
+            set result [string trim $result]
+            set test_result [string trim $test_result]
+            aa_true "$msg trivial?" [expr {$result eq $test_result}]
+        }
 
+    # Try test cases not allowing js
+    foreach \
+        msg          $test_msgs \
+        test_case    $test_cases \
+        test_result  $test_results_no_js {
+            set result [ad_dom_sanitize_html -html $test_case \
+                            -allowed_tags * \
+                            -allowed_attributes * \
+                            -allowed_protocols * \
+                            -no_js]
+            set result [string trim $result]
+            set test_result [string trim $test_result]
+            aa_true "$msg no js?" [expr {$result eq $test_result}]
+        }
+
+    # Try test cases not allowing outer urls
+    foreach \
+        msg          $test_msgs \
+        test_case    $test_cases \
+        test_result  $test_results_no_outer_urls {
+            set result [ad_dom_sanitize_html -html $test_case \
+                            -allowed_tags * \
+                            -allowed_attributes * \
+                            -allowed_protocols * \
+                            -no_outer_urls]
+            set result [string trim $result]
+            set test_result [string trim $test_result]
+            aa_true "$msg no outer urls?" [expr {$result eq $test_result}]
+        }
+
+    # Try test cases fixing markup
+    foreach \
+        msg          $test_msgs \
+        test_case    $test_cases \
+        test_result  $test_results_fixing_markup {
+            set result [ad_dom_sanitize_html -html $test_case \
+                            -allowed_tags * \
+                            -allowed_attributes * \
+                            -allowed_protocols * \
+                            -fix]
+            set result [string trim $result]
+            set test_result [string trim $test_result]
+            aa_true "$msg fixing markup?" [expr {$result eq $test_result}]
+        }
+    
     array set r [util::http::get -url [util::configured_location]]
     set test_case $r(page)
 
-    set msg "In our index page is removing tags ok"
+    set msg "Test case 6: in our index page is removing tags ok"
     set unallowed_tags {div style script}
-    set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_tags $unallowed_tags]
-    set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_tags $unallowed_tags -validate]
+    set result [ad_dom_sanitize_html -html $test_case \
+                    -allowed_tags * \
+                    -allowed_attributes * \
+                    -allowed_protocols * \
+                    -unallowed_tags $unallowed_tags]
+    set valid_p [ad_dom_sanitize_html -html $result \
+                     -allowed_tags * \
+                     -allowed_attributes * \
+                     -allowed_protocols * \
+                     -unallowed_tags $unallowed_tags \
+                     -validate]
     aa_true "$msg with validate?" $valid_p    
     aa_false $msg? [regexp {<(div|style|script)\s*[^>]*>} $result]
 
     set msg "In our index page is removing attributes ok"
     set unallowed_attributes {id style}
-    set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_attributes $unallowed_attributes]
-    set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_attributes $unallowed_attributes -validate]
+    set result [ad_dom_sanitize_html -html $test_case \
+                    -allowed_tags * \
+                    -allowed_attributes * \
+                    -allowed_protocols * \
+                    -unallowed_attributes $unallowed_attributes]
+    set valid_p [ad_dom_sanitize_html -html $result \
+                     -allowed_tags * \
+                     -allowed_attributes * \
+                     -allowed_protocols * \
+                     -unallowed_attributes $unallowed_attributes \
+                     -validate]
     aa_true "$msg with validate?" $valid_p
     aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(id|style)=".*"[^>]*>} $result]
 
-    set msg "In our index page is removing protocols ok?"
+    set msg "In our index page is removing protocols ok"
     set unallowed_protocols {http javascript https}
-    set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_protocols $unallowed_protocols]
-    set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -unallowed_protocols $unallowed_protocols -validate]
+    set result [ad_dom_sanitize_html -html $test_case \
+                    -allowed_tags * \
+                    -allowed_attributes * \
+                    -allowed_protocols * \
+                    -unallowed_protocols $unallowed_protocols]
+    set valid_p [ad_dom_sanitize_html -html $result \
+                     -allowed_tags * \
+                     -allowed_attributes * \
+                     -allowed_protocols * \
+                     -unallowed_protocols $unallowed_protocols \
+                     -validate]
     aa_true "$msg with validate?" $valid_p    
     aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(href|src|content|action)="(http|javascript):.*"[^>]*>} $result]
 
-    set msg "In our index page is removing outer links ok?"
-    set result [ad_sanitize_html -html $test_case -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls]
-    set valid_p [ad_sanitize_html -html $result -allowed_tags * -allowed_attributes * -allowed_protocols * -no_outer_urls -validate]
+    set msg "In our index page is removing outer links ok"
+    set result [ad_dom_sanitize_html -html $test_case \
+                    -allowed_tags * \
+                    -allowed_attributes * \
+                    -allowed_protocols * \
+                    -no_outer_urls]
+    set valid_p [ad_dom_sanitize_html -html $result \
+                     -allowed_tags * \
+                     -allowed_attributes * \
+                     -allowed_protocols * \
+                     -no_outer_urls \
+                     -validate]
     aa_true "$msg with validate?" $valid_p    
     aa_false $msg? [regexp {<([a-z]\w*)\s+[^>]*(href|src|content|action)="(http|https|//):.*"[^>]*>} $result]