ad_library { Contains procs used to manipulate chunks of text and html, most notably converting between them. @author Lars Pind (lars@pinds.com) @creation-date 19 July 2000 @cvs-id $Id: text-html-procs.tcl,v 1.109 2018/12/15 17:46:28 gustafn Exp $ } #################### # # text -> HTML # #################### ad_proc -public ad_text_to_html { -no_links:boolean -no_lines:boolean -no_quote:boolean -includes_html:boolean -encode:boolean text } { Converts plaintext to html. Also translates any recognized email addresses or URLs into a hyperlink. @param no_links will prevent it from highlighting @param no_quote will prevent it from HTML-quoting output, so this can be run on semi-HTML input and preserve that formatting. This will also cause spaces/tabs to not be replaced with nbsp's, because this can too easily mess up HTML tags. @param includes_html Set this if the text parameter already contains some HTML which should be preserved. @param encode This will encode international characters into its html equivalent, like "ü" into ü @author Branimir Dolicki (branimir@arsdigita.com) @author Lars Pind (lars@pinds.com) @creation-date 19 July 2000 } { if { $text eq "" } { return "" } set orig_text $text set space_added 0 set nr_links 0 if { !$no_links_p } { # # We start by putting a space in front so our URL/email # highlighting will work for URLs/emails right in the # beginning of the text. # set text " $text" set space_added 1 # if something is " http://" or " https://" or "ftp://" we # assume it is a link to an outside source. # # (bd) The only purpose of the markers is to get rid of # trailing dots, commas and things like that. Note the code # uses utf-8 codes \u0002 (start of text) and \u0003 (end of # text) special chars as marker. Previously, we had \x001 and # \x002, which do not work reliably (regsub was missing some # entries, probably due to a mess-up of the internal # representation). # set nr_links [regsub -nocase -all \ {([^a-zA-Z0-9]+)((http|https|ftp)://[^\(\)\"<>\s]+)} $text \ "\\1\u0002\\2\u0003" text] # email links have the form xxx@xxx.xxx # # JCD: don't treat things =xxx@xxx.xxx as email since most # common occurrence seems to be in URLs (although VPATH bounce # emails like bounce-user=domain.com@sourcehost.com will then # not work correctly). Another tricky case is # http://www.postgresql.org/message-id/20060329203545.M43728@narrowpathinc.com # where we do not want turn the @ into a mailto. incr nr_links [regsub -nocase -all \ {([^a-zA-Z0-9=/.-]+)(mailto:)?([^=\(\)\s:;,@<>/]+@[^\(\)\s.:;,@<>]+[.][^\(\)\s:;,@<>]+)} $text \ "\\1\u0002mailto:\\3\u0003" text] # # Remove marker from URLs that are already HREF=... or SRC=... chunks # if { $includes_html_p && $nr_links > 0} { regsub -nocase -all {((href|src)\s*=\s*['\"]?)\u0002([^\u0003]*)\u0003} $text {\1\3} text } } # At this point, before inserting some of our own <, >, and "'s # we quote the ones entered by the user: if { !$no_quote_p } { set text [ns_quotehtml $text] } if { $encode_p} { set myChars { ª º À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ ¿ } set myHTML { ª º À Á Â Ã Ä Å &Aelig; Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö Ø Ù Ú Û Ü Ý Þ ß à á â ã ä å æ ç è é ê ë ì í î ï ð ñ ò ó ô õ ö ø ù ú û ü ý þ ÿ ¿ } set map {} foreach ch $myChars entity $myHTML { lappend map $ch $entity } set text [string map $map $text] } # Convert line breaks if { !$no_lines_p } { set text [util_convert_line_breaks_to_html -includes_html=$includes_html_p -- $text] # the function strips all leading white space set space_added 0 } if { !$no_quote_p } { # Convert every two spaces to an nbsp regsub -all { } $text "\\\ " text # Convert tabs to four nbsp's regsub -all {\t} $text {\ \ \ \ } text } if { $nr_links > 0} { # # Move the end of the link before any punctuation marks at the # end of the URL. # regsub -all {([\]!?.:;,<>\(\)\}\"'-]+)(\u0003)} $text {\2\1} text # # Convert the marked links and emails into "..." # regsub -all {\u0002([^\u0003]+?)\u0003} $text {\1} text set changed_back [regsub -all {(\u0002|\u0003)} $text {} text] if {$includes_html_p} { # # All markers should be gone now. # # In case we changed something back (means something is # broken in our regexps above), provide a warning, we have # to debug. # if {$changed_back > 0} { ad_log warning "Replaced spurious magic marker in ad_text_to_html, orig:\n$orig_text" } } } if {$space_added} { set text [string range $text 1 end] } return $text } ad_proc -public ad_html_qualify_links { -location -path html } { Convert in the HTML text relative URLs into fully qualified URLs including the host name. It performs the following operations: 1. prepend paths starting with a "/" by the location (protocol and host). 2. prepend paths not starting a "/" by the path, in case it was passed in. Links, which are already fully qualified are not modified. @param location protocol and host (defaults to [ad_url]) @param path optional path to be prepended to paths not starting with a "/" @param html HTML text, in which substitutions should be performed. } { if {![info exists location]} { set location [util_current_location] } # # Make sure, location ends with a "/". # set location "[string trimright $location /]/" # # Protect all full qualified URLs with special characters (one # rule for single quotes, one for double quotes). # regsub -nocase -all \ {(href|src)\s*=\s*'((http|https|ftp|mailto):[^'\"]+)'} $html \ "\\1='\u0001\\2\u0002'" html regsub -nocase -all \ {(href|src)\s*=\s*[\"]((http|https|ftp|mailto):[^'\"]+)[\"]} $html \ "\\1=\"\u0001\\2\u0002\"" html # # If a path is specified, prefix all relative URLs (i.e. not # starting with a slash) with the specified path. # if {[info exists path]} { set path "[string trim $path /]/" regsub -all {(href|src)\s*=\s*['\"]([^/][^\u0001:'\"]+?)['\"]} $html \ "\\1='${location}${path}\\2'" html } # # Prefix every URL starting with a slash by the location. # regsub -nocase -all {(href|src)\s*=\s*['\"]/([^\u0001:'\"]+?)['\"]} $html \ "\\1='${location}\\2'" html # # Remove all protection characters again. # regsub -nocase -all {((href|src)\s*=\s*['\"]?)\u0001([^\u0002]*)\u0002} $html {\1\3} html return $html } ad_proc -public util_convert_line_breaks_to_html { {-includes_html:boolean} text } { Convert line breaks to
and
tags, respectively.
} {
# Remove any leading or trailing whitespace
regsub {^[\s]+} $text {} text
regsub {[\s]+$} $text {} text
# Make sure all line breaks are single \n's
regsub -all {\r\n} $text "\n" text
regsub -all {\r} $text "\n" text
# Remove whitespace before \n's
regsub -all {[ \t]+\n} $text "\n" text
# Wrap P's around paragraphs
regsub -all {([^\n\s])\n\n+([^\n\s])} $text {\1
\2} text
# remove line breaks right before and after HTML tags that will insert a paragraph break themselves
if { $includes_html_p } {
set tags [join { ul ol li blockquote p div table tr td th } |]
regsub -all -nocase "\\s*(?($tags)\\s*\[^>\]*>)\\s*" $text {\1} text
}
# Convert _single_ CRLF's to
's to preserve line breaks
regsub -all {\n} $text "
\n" text
# Add line breaks to P tags
#regsub -all {
Note that the internal syntax table dictates which tags are non-breaking. The syntax table has codes:
ad_parse_html_attributes_upvar
, so you can parse attributes from a string without upvar'ing.
See the documentation for the other proc.
@author Lars Pind (lars@pinds.com)
@creation-date November 10, 2000
} {
if { [info exists attribute_array] } {
upvar $attribute_array attribute_array_var
return [ad_parse_html_attributes_upvar -attribute_array attribute_array_var html pos]
} else {
return [ad_parse_html_attributes_upvar html pos]
}
}
ad_proc ad_parse_html_attributes_upvar {
-attribute_array
html_varname
pos_varname
} {
Parse attributes in an HTML fragment and return them as a list of lists.
Each element of that list is either a single element, if the attribute had no value, or a two-tuple, with the first element being the name of the attribute and the second being the value. The attribute names are all converted to lowercase.
If you don't really care what happens when the same attribute is present twice, you can also use the
attribute_array
argument, and the attributes will be
set there. For attributes without any value, we'll use the empty string.
Example:
set html {<tag foo = bar baz greble=""hello you sucker"" foo='blah' Heres = ' something for you to = "consider" '>} set pos 5 ; # the 'f' in the first 'foo' set attribute_list [ad_parse_html_attributes_upvar -attribute_array attribute_array html pos]
attribute_list
will contain the following:
{foo bar} baz {greble {"hello you sucker"}} {foo blah} {heres { something for you to = "consider" }}
attribute_array
will contain:
attribute_array(foo)='blah' attribute_array(greble)='"hello you sucker"' attribute_array(baz)='' attribute_array(heres)=' something for you to = "consider" '
Won't alter the string passed in .. promise!
We will modify pos_var. Pos_var should point to the first character inside the tag,
after the tag name (we don't care if you let if there's some whitespace before the first attribute)
@param html_varname the name of the variable holding the HTML
fragment. We promise that we won't change the contents of this
variable.
@param pos_varname the name of the variable holding the position
within the html_varname
string from which we should
start. This should point to a character inside the tag, just after
the tag name, and before the first attribute. Note that we will modify this variable.
When this proc is done, this variable will point to the tag-closing >
.
Example:
if the tag is <img src="foo">, pos_varname
should point to either the space between
img
and src
, or the s
in src
.
@param attribute_array This is an alternate way of returning the attributes, if you don't care
about what happens when the same attribute name is defined twice.
@return A list of list holding the attribute names and
values. Each element of that list is either a single element, if the
attribute had no value, or
a two-tuple, with the first element being the name of the attribute and the second being
the value. The attribute names are all converted to lowercase.
@author Lars Pind (lars@pinds.com)
@creation-date November 10, 2000
} {
upvar $html_varname html
upvar $pos_varname i
if { [info exists attribute_array] } {
upvar $attribute_array attribute_array_var
}
# This is where we're going to return the result
set attributes {}
# Loop over the attributes.
# We maintain counter is so that we don't accidentally enter an infinite loop
set count 0
while { $i < [string length $html] && [string index $html $i] ne ">" } {
if { [incr count] > 3000 } {
error "There appears to be a programming bug in ad_parse_html_attributes_upvar: We've entered an infinite loop. We are here: \noffset $i: [string range $html $i $i+60]"
}
if { [string range $html $i $i+1] eq "/>" } {
# This is an XML-style tag ending: <... />
break
}
# This regexp matches an attribute name and an equal sign, if
# present. Also eats whitespace before or after. The \A
# corresponds to ^, except it matches the position we're
# starting from, not the start of the string.
if { ![regexp -indices -start $i {\A\s*([^\s=>]+)\s*(=?)\s*} $html match attr_name_idx equal_sign_idx] } {
#
# Apparently, there's no attribute name here.
# Let's eat all whitespace and lonely equal signs.
#
regexp -indices -start $i {\A[\s=]*} $html match
set i [expr { [lindex $match 1] + 1 }]
} {
set attr_name [string tolower [string range $html [lindex $attr_name_idx 0] [lindex $attr_name_idx 1]]]
# Move past the attribute name just found
set i [expr { [lindex $match 1] + 1}]
# If there is an equal sign, we're expecting the next token to be a value
if { [lindex $equal_sign_idx 1] - [lindex $equal_sign_idx 0] < 0 } {
# No equal sign, no value
lappend attributes [list $attr_name]
if { [info exists attribute_array] } {
set attribute_array_var($attr_name) {}
}
} else {
# is there a single or double quote sign as the first character?
switch -- [string index $html $i] {
{"} { set exp {\A"([^"]*)"\s*} }
{'} { set exp {\A'([^']*)'\s*} }
default { set exp {\A([^\s>]*)\s*} }
}
if { ![regexp -indices -start $i $exp $html match attr_value_idx] } {
# No end quote.
set attr_value [string range $html $i+1 end]
set i [string length $html]
} else {
set attr_value [string range $html [lindex $attr_value_idx 0] [lindex $attr_value_idx 1]]
set i [expr { [lindex $match 1] + 1}]
}
set attr_value [util_expand_entities_ie_style $attr_value]
lappend attributes [list $attr_name $attr_value]
if { [info exists attribute_array] } {
set attribute_array_var($attr_name) $attr_value
}
}
}
}
return $attributes
}
ad_proc ad_html_security_check { html } {
Returns a human-readable explanation if the user has used any HTML
tag other than the ones marked allowed in antispam section of ad.ini.
Otherwise returns an empty string.
@return a human-readable, plaintext explanation of what's wrong with the user's input.
@author Lars Pind (lars@pinds.com)
@creation-date 20 July 2000
} {
if { [string first <% $html] > -1 } {
return "For security reasons, you're not allowed to have the less-than-percent combination in your input."
}
array set allowed_attribute [list]
array set allowed_tag [list]
array set allowed_protocol [list]
# Use the antispam tags for this package instance and whatever is on the kernel.
set allowed_tags_list [concat \
[ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedTag antispam] \
[ad_parameter_all_values_as_list AllowedTag antispam]]
set allowed_attributes_list [concat \
[ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedAttribute antispam] \
[ad_parameter_all_values_as_list AllowedAttribute antispam]]
set allowed_protocols_list [concat \
[ad_parameter_all_values_as_list -package_id [ad_acs_kernel_id] AllowedProtocol antispam] \
[ad_parameter_all_values_as_list AllowedProtocol antispam]]
foreach attribute $allowed_attributes_list {
set allowed_attribute([string tolower $attribute]) 1
}
foreach tagname $allowed_tags_list {
set allowed_tag([string tolower $tagname]) 1
}
foreach protocol $allowed_protocols_list {
set allowed_protocol([string tolower $protocol]) 1
}
# loop over all tags
for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } {
# move past the tag-opening <
incr i
if { ![regexp -indices -start $i {\A/?([-_a-zA-Z0-9]+)\s*} $html match name_idx] } {
# The tag-opener isn't followed by USASCII letters (with or without optional initial slash)
# Not considered a tag. Shouldn't do any harm in browsers.
# (Tested with digits, with A syntax, with whitespace)
} else {
# The tag was valid ... now let's see if it's on the allowed list.
set tagname [string tolower [string range $html [lindex $name_idx 0] [lindex $name_idx 1]]]
if { ![info exists allowed_tag($tagname)] && ![info exists allowed_tag(*)] } {
# Nope, this was a naughty tag.
return "For security reasons we only accept the submission of HTML
containing the following tags: [join $allowed_tags_list " "].
You have a [string toupper $tagname] tag in there."
} else {
# Legal tag.
# Make i point to the first character inside the tag, after the tag name and any whitespace
set i [expr { [lindex $match 1] + 1}]
set attr_list [ad_parse_html_attributes_upvar html i]
foreach attribute $attr_list {
lassign $attribute attr_name attr_value
if { ![info exists allowed_attribute($attr_name)]
&& ![info exists allowed_attribute(*)] } {
return "The attribute '$attr_name' is not allowed for $tagname tags"
}
if { [string tolower $attr_name] ne "style" } {
if { [regexp {^\s*([^\s:]+):\/\/} $attr_value match protocol] } {
if { ![info exists allowed_protocol([string tolower $protocol])]
&& ![info exists allowed_protocol(*)] } {
return "Your URLs can only use these protocols: [join $allowed_protocols_list ", "].
You have a '$protocol' protocol in there."
}
}
}
}
}
}
}
return {}
}
# This was created in order to pre-process some content to be fed
# to tDOM in ad_sanitize_html. In fact, even with its least picky
# behavior, tDOM cannot swallow whatever markup you give it. This
# proc might also be used in order to improve some OpenACS
# routines, like util_close_html_tags. As it has some limitations,
# this is left to future considerations.
ad_proc -public ad_dom_fix_html {
-html:required
{-marker "root"}
-dom:boolean
} {
Similar in spirit to the famous Tidy command line utility,
this proc takes a piece of possibly invalid markup and returns
a 'fixed' version where unopened tags have been closed and
attribute specifications have been normalized by transforming them
in the form attribute-name="attribute value"
. All
attributes with an invalid (non-alphanumeric) name will be
stripped.
Be aware that every comment and also the possibly present
DOCTYPE declaration will be stripped from the markup. Also,
most of tag's internal whitespace will be trimmed. This
behavior comes from the htmlparse library used in this
implementation.
@param html Markup to process
@param marker Root element use to enforce a single root of the
DOM tree.
@param dom When this flag is set, instead of returning markup,
the proc will return the tDOM object built during the
operation. Useful when the result should be used by tDOM
anyway, so we can avoid superfluous parsing.
@return markup or a tDOM document object if the -dom flag is
specified
@author Antonio Pisano
} {
if {[catch {package require struct}]} {
error "Package struct non found on the system"
}
if {[catch {package require htmlparse}]} {
error "Package htmlparse non found on the system"
}
set tree [::struct::tree]
catch {::htmlparse::tags destroy}
::struct::stack ::htmlparse::tags
::htmlparse::tags push root
$tree set root type root
::htmlparse::parse \
-cmd [list ::htmlparse::2treeCallback $tree] \
-incvar errs $html
$tree walk root -order post n {
::htmlparse::Reorder $tree $n
}
::htmlparse::tags destroy
set lmarker "<$marker>"
set rmarker "$marker>"
set doc [dom createDocument $marker]
set root [$doc documentElement]
set queue {}
lappend queue [list $root [$tree children [$tree children root]]]
while {$queue ne {}} {
lassign [lindex $queue 0] domparent treechildren
set queue [lrange $queue 1 end]
foreach child $treechildren {
set type [$tree get $child type]
set data [$tree get $child data]
if {$type eq "PCDATA"} {
set el [$doc createTextNode $data]
} else {
set el [$doc createElement $type]
# parse element attributes
while {$data ne ""} {
set data [string trim $data]
# attribute with a value, optionally surrounded by double or single quotes
if {[regexp "^(\[^= \]+)=(\"\[^\"\]*\"|'\[^'\].*'|\[^ \]*)" $data m attname attvalue]} {
if {[string match "\"*\"" $attvalue] ||
[string match "'*'" $attvalue]} {
set attvalue [string range $attvalue 1 end-1]
}
# attribute with no value
} elseif {[regexp {^([^\s]+)} $data m attname]} {
set attvalue ""
} else {
error "Unrecoverable attribute spec in supplied markup"
}
# skip bogus attribute names
if {[string is alnum -strict $attname]} {
$el setAttribute $attname $attvalue
}
set data [string range $data [string length $m] end]
}
}
$domparent appendChild $el
set elchildren [$tree children $child]
if {$elchildren ne {}} {
lappend queue [list $el $elchildren]
}
}
}
$tree destroy
if {$dom_p} {
return $doc
} else {
set html [$doc asHTML]
$doc delete
set html [string range $html [string length $lmarker] end-[string length $rmarker]]
}
return [string trim $html]
}
# Original purpose of this proc was to introduce a better way to
# enforce some HTML policies on the content submitted by the uses
# (e.g. forbid some tag/attribute like