Index: openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl,v diff -u -r1.6 -r1.7 --- openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl 23 Sep 2003 13:15:27 -0000 1.6 +++ openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl 23 Sep 2003 13:19:45 -0000 1.7 @@ -33,7 +33,10 @@ @author Lars Pind (lars@pinds.com) @creation-date 19 July 2000 } { - + if { [empty_string_p $text] } { + return {} + } + if { !$no_links_p } { # We start by putting a space in front so our URL/email highlighting will work # for URLs/emails right in the beginning of the text. @@ -103,7 +106,6 @@ return $text } - ad_proc -public util_convert_line_breaks_to_html { {-includes_html:boolean} text @@ -122,7 +124,7 @@ regsub -all {[ \t]*\n} $text "\n" text # Wrap P's around paragraphs - set text "

$text

" + set text "

$text

" regsub -all {([^\n\s])\n\n([^\n\s])} $text {\1

\2} text # Convert _single_ CRLF's to
's to preserve line breaks @@ -146,77 +148,7 @@ } -ad_proc -public ad_text_to_html_old { - -no_links:boolean - text -} { - Converts plaintext to html. Also translates any recognized - email addresses or URLs into a hyperlink. - @param no_links will prevent it from highlighting - - @author Branimir Dolicki (branimir@arsdigita.com) - @author Lars Pind (lars@pinds.com) - @creation-date 19 July 2000 -} { - - if { !$no_links_p } { - # We start by putting a space in front so our URL/email highlighting will work - # for URLs/emails right in the beginning of the text. - set text " $text" - - # if something is " http://" or " https://" - # we assume it is a link to an outside source. - - # (bd) The only purpose of thiese sTaRtUrL and - # eNdUrL markers is to get rid of trailing dots, - # commas and things like that. Note that there - # is a TAB before and after each marker. - - regsub -nocase -all {([^a-zA-Z0-9]+)(http://[^\(\)"<>\s]+)} $text "\\1\tsTaRtUrL\\2eNdUrL\t" text - regsub -nocase -all {([^a-zA-Z0-9]+)(https://[^\(\)"<>\s]+)} $text "\\1\tsTaRtUrL\\2eNdUrL\t" text - regsub -nocase -all {([^a-zA-Z0-9]+)(ftp://[^\(\)"<>\s]+)} $text "\\1\tsTaRtUrL\\2eNdUrL\t" text - - # email links have the form xxx@xxx.xxx - regsub -nocase -all {([^a-zA-Z0-9]+)([^\(\)\s:;,@<>]+@[^\(\)\s.:;,@<>]+[.][^\(\)\s:;,@<>]+)} $text \ - "\\1\tsTaRtEmAiL\\2eNdEmAiL\t" text - - } - - # At this point, before inserting some of our own <, >, and "'s - # we quote the ones entered by the user: - set text [ad_quotehtml $text] - - # Convert _single_ CRLF's to
's to preserve line breaks - regsub -all {\r*\n} $text "
" text - - # Convert every two spaces to an nbsp - regsub -all { } $text "\\\  " text - - # turn CRLFCRLF into

- if { [regsub -all {\r\n\s*\r\n} $text "

" text] == 0 } { - # try LFLF - if { [regsub -all {\n\s*\n} $text "

" text] == 0 } { - # try CRCR - regsub -all {\r\s*\r} $text "

" text - } - } - - if { !$no_links_p } { - # Dress the links and emails with A HREF - regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdUrL\t)} $text {\2\1} text - regsub -all {([]!?.:;,<>\(\)\}"'-]+)(eNdEmAiL\t)} $text {\2\1} text - regsub -all {\tsTaRtUrL([^\t]*)eNdUrL\t} $text {\1} text - regsub -all {\tsTaRtEmAiL([^\t]*)eNdEmAiL\t} $text {\1} text - set text [string trimleft $text] - } - - # Convert every tab to 4 nbsp's - regsub -all {\t} $text {\ \ \ \ } text - - return $text -} - ad_proc -public ad_quotehtml { arg } { Quotes ampersands, double-quotes, and angle brackets in $arg. @@ -484,14 +416,17 @@ # we should now walk the stack and close any open tags. for {set i $tagptr} { $i > -1 } {incr i -1} { - # append out " " - append out "" + set tag $tagstack($i) + + # LARS: Only close tags which we aren't supposed to remove + if { ![string equal $syn($tag) "discard"] && ![string equal $syn($tag) "remove"] } { + append out "" + } } return $out } - ad_proc ad_parse_html_attributes { -attribute_array html @@ -718,9 +653,9 @@ if { ![info exists allowed_tag($tagname)] } { # Nope, this was a naughty tag. - return "

For security reasons we only accept the submission of HTML - containing the following tags:

[join $allowed_tags_list " "]
- You have a <$tagname> tag in there.

" + return "For security reasons we only accept the submission of HTML + containing the following tags: [join $allowed_tags_list " "]. + You have a <$tagname> tag in there." } else { # Legal tag. @@ -760,6 +695,7 @@ # HTML -> Text # #################### + ad_proc -public ad_html_to_text { {-maxlen 70} {-showtags:boolean} @@ -798,6 +734,15 @@ for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { # append everything up to and not including the tag-opening < ad_html_to_text_put_text output [string range $html $last_tag_end [expr {$i - 1}]] + + # Check that tag doesn't start with whitespace, in which case it's just a lone < that + # was errorneously left unquoted + if { $i >= [expr $length-1] || ![string is alpha [string index $html [expr $i + 1]]] } { + # Output the < and continue with next character + ad_html_to_text_put_text output "<" + set last_tag_end [incr i] + continue + } # we're inside a tag now. Find the end of it @@ -837,7 +782,7 @@ } set string_delimiter [string index $html $string_delimiter_idx] - # If the greeater than sign appears before any of the string delimters, we've found the tag end. + # If the greater than sign appears before any of the string delimters, we've found the tag end. if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } { # we found the tag end set i $gt_idx @@ -856,7 +801,7 @@ set full_tag [string range $html $tag_start [expr { $i - 1 }]] - if { ![regexp {(/?)([^\s]+)[\s]*(\s.*)?} $full_tag match slash tagname attributes] } { + if { ![regexp {^(/?)([^\s]+)[\s]*(\s.*)?$} $full_tag match slash tagname attributes] } { # A malformed tag -- just delete it } else { @@ -946,7 +891,7 @@ ad_html_to_text_put_text output \" } img { - if { [empty_string_p $slash] } { + if { [empty_string_p $slash] && !$no_format_p } { set img_info {} if { [info exists attribute_array(alt)] } { lappend img_info "'$attribute_array(alt)'" @@ -979,8 +924,8 @@ # Close any unclosed tags set output(pre) 0 while { $output(blockquote) > 0 } { - incr $output(blockquote) -1 - incr $output(maxlen) 4 + incr output(blockquote) -1 + incr output(maxlen) 4 } # write out URLs, if necessary: @@ -991,235 +936,6 @@ return $output(text) } -ad_proc -public ad_html_to_text_old { - {-maxlen 70} - {-showtags:boolean} - html -} { - Returns a best-guess plain text version of an HTML fragment. - Parses the HTML and does some simple formatting. The parser and - formatting - is pretty stupid, but it's better than nothing. - - @param maxlen the line length you want your output wrapped to. - @param showtags causes any unknown (and uninterpreted) tags to get shown in the output. - - @author Lars Pind (lars@pinds.com) - @author Aaron Swartz (aaron@swartzfam.com) - @creation-date 19 July 2000 -} { - set output(text) {} - set output(linelen) 0 - set output(maxlen) $maxlen - set output(pre) 0 - set output(p) 0 - set output(br) 0 - set output(space) 0 - set output(blockquote) 0 - - set length [string length $html] - set last_tag_end 0 - - # For showing the URL of links. - set href_urls [list] - set href_stack [list] - - for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { - # append everything up to and not including the tag-opening < - ad_html_to_text_put_text output [string range $html $last_tag_end [expr {$i - 1}]] - - # we're inside a tag now. Find the end of it - - # make i point to the char after the < - incr i - set tag_start $i - - set count 0 - while 1 { - if { [incr count] > 100 } { - error "There appears to be a programming bug in ad_html_to_text: We've entered an infinite loop." - } - # Find the positions of the first quote, apostrophe and greater-than sign. - set quote_idx [string first \" $html $i] - set apostrophe_idx [string first ' $html $i] - set gt_idx [string first > $html $i] - - # If there is no greater-than sign, then the tag isn't closed. - if { $gt_idx == -1 } { - set i $length - break - } - - # Find the first of the quote and the apostrophe - if { $apostrophe_idx == -1 } { - set string_delimiter_idx $quote_idx - } else { - if { $quote_idx == -1 } { - set string_delimiter_idx $apostrophe_idx - } else { - if { $apostrophe_idx < $quote_idx } { - set string_delimiter_idx $apostrophe_idx - } else { - set string_delimiter_idx $quote_idx - } - } - } - set string_delimiter [string index $html $string_delimiter_idx] - - # If the greeater than sign appears before any of the string delimters, we've found the tag end. - if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } { - # we found the tag end - set i $gt_idx - break - } - - # Otherwise, we'll have to skip past the ending string delimiter - set i [string first $string_delimiter $html [incr string_delimiter_idx]] - if { $i == -1 } { - # Missing string end delimiter - set i $length - break - } - incr i - } - - set full_tag [string range $html $tag_start [expr { $i - 1 }]] - - if { ![regexp {(/?)([^\s]+)[\s]*(\s.*)?} $full_tag match slash tagname attributes] } { - # A malformed tag -- just delete it - } else { - - # Reset/create attribute array - array unset attribute_array - - # Parse the attributes - ad_parse_html_attributes -attribute_array attribute_array $attributes - - switch -- [string tolower $tagname] { - p - ul - ol - table { - set output(p) 1 - } - br { - ad_html_to_text_put_newline output - } - tr - td - th { - set output(br) 1 - } - h1 - h2 - h3 - h4 - h5 - h6 { - set output(p) 1 - if { [empty_string_p $slash] } { - ad_html_to_text_put_text output [string repeat "*" [string index $tagname 1]] - } - } - li { - set output(br) 1 - if { [empty_string_p $slash] } { - ad_html_to_text_put_text output "- " - } - } - strong - b { - ad_html_to_text_put_text output "*" - } - em - i - cite - u { - ad_html_to_text_put_text output "_" - } - a { - if { [empty_string_p $slash] } { - if { [info exists attribute_array(href)] } { - if { [info exists attribute_array(title)] } { - set title ": '$attribute_array(title)'" - } else { - set title "" - } - set href_no [expr [llength $href_urls] + 1] - lappend href_urls "\[$href_no\] $attribute_array(href) " - lappend href_stack "\[$href_no$title\]" - } elseif { [info exists attribute_array(title)] } { - lappend href_stack "\[$attribute_array(title)\]" - } else { - lappend href_stack {} - } - } else { - if { [llength $href_stack] > 0 } { - if { ![empty_string_p [lindex $href_stack end]] } { - ad_html_to_text_put_text output [lindex $href_stack end] - } - set href_stack [lreplace $href_stack end end] - } - } - } - pre { - set output(p) 1 - if { [empty_string_p $slash] } { - incr output(pre) - } else { - incr output(pre) -1 - } - } - blockquote { - set output(p) 1 - if { [empty_string_p $slash] } { - incr output(blockquote) - incr output(maxlen) -4 - } else { - incr output(blockquote) -1 - incr output(maxlen) 4 - } - } - hr { - set output(p) 1 - ad_html_to_text_put_text output [string repeat "-" $output(maxlen)] - set output(p) 1 - } - q { - ad_html_to_text_put_text output \" - } - img { - if { [empty_string_p $slash] } { - set img_info {} - if { [info exists attribute_array(alt)] } { - lappend img_info "'$attribute_array(alt)'" - } - if { [info exists attribute_array(src)] } { - lappend img_info $attribute_array(src) - } - if { [llength $img_info] == 0 } { - ad_html_to_text_put_text output {[IMAGE]} - } else { - ad_html_to_text_put_text output "\[IMAGE: [join $img_info " "]\]" - } - } - } - default { - # Other tag - if { $showtags_p } { - ad_html_to_text_put_text output "<$slash$tagname$attributes>" - } - } - } - } - - # set end of last tag to the character following the > - set last_tag_end [incr i] - } - # append everything after the last tag - ad_html_to_text_put_text output [string range $html $last_tag_end end] - - # Close any unclosed tags - set output(pre) 0 - while { $output(blockquote) > 0 } { - incr $output(blockquote) -1 - incr $output(maxlen) 4 - } - - # write out URLs, if necessary: - if { [llength $href_urls] > 0 } { - append output(text) "\n\n[join $href_urls "\n"]" - } - - return $output(text) -} - ad_proc -private ad_html_to_text_put_newline { output_var } { Helper proc for ad_html_to_text @@ -1360,7 +1076,7 @@ } { regsub -all {<} $html {<} html regsub -all {>} $html {>} html - regsub -all {"} $html {"} html + regsub -all {"} $html "\"" html regsub -all {—} $html {--} html regsub -all {—} $html {--} html # Need to do the & last, because otherwise it could interfere with the other expansions, @@ -1504,8 +1220,9 @@ #################### ad_proc -public ad_html_text_convert { - {-from text} - {-to html} + {-from text/plain} + {-to text/html} + {-maxlen 70} text } { Converts a chunk of text from text/html to text/html. @@ -1522,46 +1239,91 @@ @author Lars Pind (lars@pinds.com) @creation-date 19 July 2000 } { + set valid_froms { text/enhanced text/plain text/fixed-width text/html } + set valid_tos { text/plain text/html } + + # Validate procedure input + set from [ad_decode $from "html" "text/html" "text" "text/plain" "plain" "text/plain" $from] + if { [lsearch $valid_froms $from] == -1 } { + error "Unknown text input format, '$from'. Valid formats are $valid_froms." + } + + set to [ad_decode $to "html" "text/html" "text" "text/plain" "plain" "text/plain" $to] + if { [lsearch $valid_tos $to] == -1 } { + error "Unknown text input format, '$to'. Valid formats are $valid_tos." + } + + # Do the conversion switch $from { - text/html - - html { + text/enhanced { switch $to { - text/html - - html { - ad_html_security_check $text - return [util_close_html_tags $text] + text/html { + set text [ad_enhanced_text_to_html $text] } - text/plain - - text { - return [ad_html_to_text -- $text] + text/plain { + set text [ad_enhanced_text_to_plain_text $text] } - default { - return -code error "Can only convert to text or html" + } + } + text/plain { + switch $to { + text/html { + set text [ad_text_to_html -- $text] } + text/plain { + set text [wrap_string $text $maxlen] + } } - } - text/plain - - text { + } + text/fixed-width { switch $to { - text/html - - html { - return [ad_text_to_html -- $text] + text/html { + set text "
[ad_text_to_html -no_lines -- $text]
" } - text/plain - - text { - return [wrap_string $text 70] + text/plain { + set text [wrap_string $text $maxlen] } - default { - return -code error "Can only convert to text or html" + } + } + text/html { + switch $to { + text/html { + set text [util_close_html_tags $text] } + text/plain { + set text [ad_html_to_text -maxlen $maxlen -- $text] + } } } - default { - return -code error "Can only convert from text or html" - } } + + return $text } +ad_proc -public ad_enhanced_text_to_html { + text +} { + Converts enhanced text format to normal HTML. + @author Lars Pind (lars@pinds.com) + @creation-date 2003-01-27 +} { + return [ad_text_to_html -no_quote -includes_html -- [util_close_html_tags $text]] +} + +ad_proc -public ad_enhanced_text_to_plain_text { + {-maxlen 70} + text +} { + Converts enhanced text format to normal plaintext format. + @author Lars Pind (lars@pinds.com) + @creation-date 2003-01-27 +} { + # Convert the HTML version to plaintext. + return [ad_html_to_text -maxlen $maxlen -- [ad_enhanced_text_to_html $text]] +} + + + ad_proc -public ad_convert_to_html { {-html_p f} text @@ -1640,6 +1402,8 @@ ad_proc string_truncate { {-len 200} {-format html} + {-no_format:boolean} + {-ellipsis "..."} string } { Truncates a string to len characters (defaults to the @@ -1650,6 +1414,7 @@ @param len The lenght to truncate to. Defaults to parameter TruncateDescriptionLength. @param format html or text. + @param no_format causes hyperlink tags not to get listed at the end of the output. @param string The string to truncate. @return The truncated string, with HTML tags cloosed or converted to text, depending on format. @@ -1658,13 +1423,17 @@ @creation-date September 8, 2002 } { if { [string length $string] > $len } { - set string "[string range $string 0 $len]..." + set string "[string range $string 0 [expr $len-[string length $ellipsis]-1]]$ellipsis" } - if { [string equal $format "html"] } { + if { [string equal $format "html"] && !$no_format_p } { set string [util_close_html_tags $string] } else { - set string [ad_html_to_text -- $string] + if { $no_format_p } { + set string [ad_html_to_text -no_format $string] + } else { + set string [ad_html_to_text -- $string] + } } return $string } @@ -1679,7 +1448,9 @@ ad_proc -deprecated util_striphtml {html} { - Use ad_html_to_text instead. + Deprecated. Use ad_html_to_text instead. + + @see ad_html_to_text } { return [ad_html_to_text -- $html] } @@ -1700,6 +1471,7 @@ plaintext. We'd rather let the user change our opinion about the text, e.g. html_p = 't'. + @see ad_text_to_html } { if { [regexp -nocase {

} $raw_string] || [regexp -nocase {
} $raw_string] } { # user was already trying to do this as HTML @@ -1715,6 +1487,8 @@ href="/api-doc/proc-view?proc=ad_convert_to_html">ad_convert_to_html instead. + @see ad_convert_to_html + } { if { $html_p == "t" } { return $raw_string @@ -1726,20 +1500,26 @@ ad_proc -deprecated util_quotehtml { arg } { This proc does exactly the same as ad_quotehtml. Use that instead. This one will be deleted eventually. + + @see ad_quotehtml } { return [ad_quotehtml $arg] } ad_proc -deprecated util_quote_double_quotes {arg} { This proc does exactly the same as ad_quotehtml. Use that instead. This one will be deleted eventually. + + @see ad_quotehtml } { return [ad_quotehtml $arg] } ad_proc -deprecated philg_quote_double_quotes {arg} { This proc does exactly the same as ad_quotehtml. Use that instead. This one will be deleted eventually. + + @see ad_quotehtml } { return [ad_quotehtml $arg] }