Index: openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl =================================================================== RCS file: /usr/local/cvsroot/openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl,v diff -u -r1.5 -r1.6 --- openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl 23 Sep 2003 13:07:12 -0000 1.5 +++ openacs.org-dev/packages/acs-tcl/tcl/text-html-procs.tcl 23 Sep 2003 13:15:27 -0000 1.6 @@ -760,10 +760,10 @@ # HTML -> Text # #################### - ad_proc -public ad_html_to_text { {-maxlen 70} {-showtags:boolean} + {-no_format:boolean} html } { Returns a best-guess plain text version of an HTML fragment. @@ -773,7 +773,237 @@ @param maxlen the line length you want your output wrapped to. @param showtags causes any unknown (and uninterpreted) tags to get shown in the output. + @param no_format causes hyperlink tags not to get listed at the end of the output. + + @author Lars Pind (lars@pinds.com) + @author Aaron Swartz (aaron@swartzfam.com) + @creation-date 19 July 2000 +} { + set output(text) {} + set output(linelen) 0 + set output(maxlen) $maxlen + set output(pre) 0 + set output(p) 0 + set output(br) 0 + set output(space) 0 + set output(blockquote) 0 + set length [string length $html] + set last_tag_end 0 + + # For showing the URL of links. + set href_urls [list] + set href_stack [list] + + for { set i [string first < $html] } { $i != -1 } { set i [string first < $html $i] } { + # append everything up to and not including the tag-opening < + ad_html_to_text_put_text output [string range $html $last_tag_end [expr {$i - 1}]] + + # we're inside a tag now. Find the end of it + + # make i point to the char after the < + incr i + set tag_start $i + + set count 0 + while 1 { + if { [incr count] > 100 } { + error "There appears to be a programming bug in ad_html_to_text: We've entered an infinite loop." + } + # Find the positions of the first quote, apostrophe and greater-than sign. + set quote_idx [string first \" $html $i] + set apostrophe_idx [string first ' $html $i] + set gt_idx [string first > $html $i] + + # If there is no greater-than sign, then the tag isn't closed. + if { $gt_idx == -1 } { + set i $length + break + } + + # Find the first of the quote and the apostrophe + if { $apostrophe_idx == -1 } { + set string_delimiter_idx $quote_idx + } else { + if { $quote_idx == -1 } { + set string_delimiter_idx $apostrophe_idx + } else { + if { $apostrophe_idx < $quote_idx } { + set string_delimiter_idx $apostrophe_idx + } else { + set string_delimiter_idx $quote_idx + } + } + } + set string_delimiter [string index $html $string_delimiter_idx] + + # If the greeater than sign appears before any of the string delimters, we've found the tag end. + if { $gt_idx < $string_delimiter_idx || $string_delimiter_idx == -1 } { + # we found the tag end + set i $gt_idx + break + } + + # Otherwise, we'll have to skip past the ending string delimiter + set i [string first $string_delimiter $html [incr string_delimiter_idx]] + if { $i == -1 } { + # Missing string end delimiter + set i $length + break + } + incr i + } + + set full_tag [string range $html $tag_start [expr { $i - 1 }]] + + if { ![regexp {(/?)([^\s]+)[\s]*(\s.*)?} $full_tag match slash tagname attributes] } { + # A malformed tag -- just delete it + } else { + + # Reset/create attribute array + array unset attribute_array + + # Parse the attributes + ad_parse_html_attributes -attribute_array attribute_array $attributes + + switch -- [string tolower $tagname] { + p - ul - ol - table { + set output(p) 1 + } + br { + ad_html_to_text_put_newline output + } + tr - td - th { + set output(br) 1 + } + h1 - h2 - h3 - h4 - h5 - h6 { + set output(p) 1 + if { [empty_string_p $slash] } { + ad_html_to_text_put_text output [string repeat "*" [string index $tagname 1]] + } + } + li { + set output(br) 1 + if { [empty_string_p $slash] } { + ad_html_to_text_put_text output "- " + } + } + strong - b { + ad_html_to_text_put_text output "*" + } + em - i - cite - u { + ad_html_to_text_put_text output "_" + } + a { + if { [empty_string_p $slash] && !$no_format_p} { + if { [info exists attribute_array(href)] } { + if { [info exists attribute_array(title)] } { + set title ": '$attribute_array(title)'" + } else { + set title "" + } + set href_no [expr [llength $href_urls] + 1] + lappend href_urls "\[$href_no\] $attribute_array(href) " + lappend href_stack "\[$href_no$title\]" + } elseif { [info exists attribute_array(title)] } { + lappend href_stack "\[$attribute_array(title)\]" + } else { + lappend href_stack {} + } + } else { + if { [llength $href_stack] > 0 } { + if { ![empty_string_p [lindex $href_stack end]] } { + ad_html_to_text_put_text output [lindex $href_stack end] + } + set href_stack [lreplace $href_stack end end] + } + } + } + pre { + set output(p) 1 + if { [empty_string_p $slash] } { + incr output(pre) + } else { + incr output(pre) -1 + } + } + blockquote { + set output(p) 1 + if { [empty_string_p $slash] } { + incr output(blockquote) + incr output(maxlen) -4 + } else { + incr output(blockquote) -1 + incr output(maxlen) 4 + } + } + hr { + set output(p) 1 + ad_html_to_text_put_text output [string repeat "-" $output(maxlen)] + set output(p) 1 + } + q { + ad_html_to_text_put_text output \" + } + img { + if { [empty_string_p $slash] } { + set img_info {} + if { [info exists attribute_array(alt)] } { + lappend img_info "'$attribute_array(alt)'" + } + if { [info exists attribute_array(src)] } { + lappend img_info $attribute_array(src) + } + if { [llength $img_info] == 0 } { + ad_html_to_text_put_text output {[IMAGE]} + } else { + ad_html_to_text_put_text output "\[IMAGE: [join $img_info " "] \]" + } + } + } + default { + # Other tag + if { $showtags_p } { + ad_html_to_text_put_text output "<$slash$tagname$attributes>" + } + } + } + } + + # set end of last tag to the character following the > + set last_tag_end [incr i] + } + # append everything after the last tag + ad_html_to_text_put_text output [string range $html $last_tag_end end] + + # Close any unclosed tags + set output(pre) 0 + while { $output(blockquote) > 0 } { + incr $output(blockquote) -1 + incr $output(maxlen) 4 + } + + # write out URLs, if necessary: + if { [llength $href_urls] > 0 } { + append output(text) "\n\n[join $href_urls "\n"]" + } + + return $output(text) +} + +ad_proc -public ad_html_to_text_old { + {-maxlen 70} + {-showtags:boolean} + html +} { + Returns a best-guess plain text version of an HTML fragment. + Parses the HTML and does some simple formatting. The parser and + formatting + is pretty stupid, but it's better than nothing. + + @param maxlen the line length you want your output wrapped to. + @param showtags causes any unknown (and uninterpreted) tags to get shown in the output. + @author Lars Pind (lars@pinds.com) @author Aaron Swartz (aaron@swartzfam.com) @creation-date 19 July 2000