import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Plain-text to HTML auto-formatting helpers, ported from a Perl original.
 *
 * <p>Two URL linkifiers are provided: the hand-rolled scanner
 * {@link #autoLinkifyUrls(StringBuffer)} and the regex filter chain behind
 * {@link #autoLinkifyUrlsR(StringBuffer)}, which {@link #autoFormat(StringBuffer)} uses.
 *
 * <p>NOTE(review): the copy of this file that was reviewed had been run through an HTML
 * stripper, so every string literal containing markup or entities was reconstructed from
 * the surrounding position arithmetic and the Perl reference comments. Confirm the
 * literals (in particular the tab replacement and the wrapper tags printed by
 * {@code main}) against the upstream source.
 */
public class AutoFormat {

    /**
     * Replaces every occurrence of {@code match} in {@code in} with {@code replace}, in place.
     * Text produced by a replacement is never rescanned.
     *
     * @param in      buffer to modify
     * @param match   literal text to search for; an empty string matches nothing
     * @param replace literal replacement text
     * @return {@code in}, for chaining
     */
    public static StringBuffer replaceAll( StringBuffer in, String match, String replace ) {
        int ml = match.length();
        int rl = replace.length();
        if ( ml == 0 ) {
            // An empty pattern matches at every index and would grow the buffer forever.
            return in;
        }
        int pos = in.indexOf( match );
        while ( pos >= 0 ) {
            in.replace( pos, pos + ml, replace );
            // Resume after the inserted text so that e.g. "&" -> "&amp;" terminates.
            pos = in.indexOf( match, pos + rl );
        }
        return in;
    }

    /**
     * Converts plain text to simple HTML, in place.
     *
     * <p>Steps: trim surrounding whitespace; optionally escape {@code & " < >}; replace tabs;
     * strip control characters; turn a newline followed by a whitespace run containing another
     * newline into {@code <p>}; give any other newline a {@code <br>}; collapse runs of
     * whitespace into a space plus {@code &nbsp;}.
     *
     * <p>This function has been pretty faithfully translated from the perl.
     *
     * @param in           buffer to format
     * @param noescapetags when {@code true}, HTML-significant characters are left untouched
     * @return {@code in}
     */
    static StringBuffer plaintextFormat( StringBuffer in, boolean noescapetags ) {
        // Remove excess whitespace from the front and end of the text
        int start = 0;
        while ( start < in.length() && Character.isWhitespace( in.charAt( start ) ) ) {
            start++;
        }
        in.delete( 0, start );
        if ( in.length() <= 0 ) {
            return in;
        }
        // The first character is now non-whitespace, so this loop stops at index >= 1.
        int end = in.length();
        while ( Character.isWhitespace( in.charAt( end - 1 ) ) ) {
            end--;
        }
        in.setLength( end );

        // Perform standard plain-old-text conversions
        if ( ! noescapetags ) {
            in = replaceAll( in, "&", "&amp;" );
            in = replaceAll( in, "\"", "&quot;" );
            in = replaceAll( in, "<", "&lt;" );
            in = replaceAll( in, ">", "&gt;" );
        }
        // "\r" needs no separate pass: it is removed by the control-character filter below.
        // NOTE(review): the tab replacement may originally have been wider than one space.
        in = replaceAll( in, "\t", " " );

        // filter chars
        int pos = 0;
        char prevc = '\0';
        while ( pos < in.length() ) {
            char c = in.charAt( pos );
            if ( c <= 8 || (c >= 0x0B && c <= 0x1F) ) {
                // s#[\x00-\x08\x0B-\x1F]##g;   # Nuke control characters
                in.deleteCharAt( pos );
            } else if ( c == '\n' ) {
                // Look ahead over the whitespace run, remembering the last newline in it.
                boolean run = false;
                int endnl = -1;
                int np = pos + 1;
                while ( np < in.length() && Character.isWhitespace( in.charAt( np ) ) ) {
                    run = true;
                    if ( in.charAt( np ) == '\n' ) {
                        endnl = np;
                    }
                    np++;
                }
                if ( run && (endnl > 0) ) {
                    // s#\n\s*\n#<p>\n#gs;
                    // Everything up to (not including) the last newline becomes "<p>".
                    in.replace( pos, endnl, "<p>" );
                    // Skip "<p>" and the newline kept after it. The port used endnl + 4,
                    // an index from before the replacement, and so skipped characters.
                    pos += 4;
                    prevc = '>';
                } else {
                    // s#(?<!<p>)\n#<br>\n#gs;
                    in.insert( pos, "<br>" );
                    pos += 5; // past "<br>" and the newline itself
                    prevc = '>';
                }
            } else if ( prevc == '\n' && c == ' ' ) {
                // s#^ #&nbsp;#gm;
                // NOTE(review): prevc is never assigned '\n' in this loop, so this branch
                // looks unreachable; kept to mirror the Perl.
                in.replace( pos - 1, pos + 1, "&nbsp;" );
                pos += 5;
                prevc = ';';
            } else if ( Character.isWhitespace( prevc ) && Character.isWhitespace( c ) ) {
                int lastsp = pos + 1;
                while ( lastsp < in.length() && Character.isWhitespace( in.charAt( lastsp ) ) ) {
                    lastsp++;
                }
                // s#  # &nbsp;#g;
                in.replace( pos - 1, lastsp, " &nbsp;" );
                // The 7-character replacement starts at pos - 1, so scanning resumes at
                // pos + 6. The port used lastsp + 5, which overshoots for runs longer than two.
                pos += 6;
                prevc = ';';
                // Not ported: change remaining non-ASCII chars to entities
                // s!([^\n\t\x20-\x7E])!'&#'.ord($1).';'!ge unless $noescapetags;
            } else {
                prevc = c;
                pos++;
            }
        }
        return in;
    }

    /* maybe pull urlSchemes from a db set variable? */
    static String[] urlSchemes = {
        "http://",
        "https://",
        "ftp://",
        //"file://",
        "mailto:",
    };

    /**
     * Wraps naked URLs in {@code in} with anchor tags, in place, using a manual scan.
     *
     * <p>Allowed forms of the original perl version:
     * [url text], [text url], {url text}, {text url};
     * url text is [^\[][^]]+? inside [] and [^{][^}]+? inside {}.
     * For a naked url the perl does not include any of ".!?_*=" as part of the url if they
     * are the last char, and the url should be followed by whitespace, one of
     * "\n&lt;()[]{}" or the end of the input.
     *
     * <p>Currently only naked urls are augmented, and they are considered as extending until
     * whitespace or '&lt;' or '&gt;' with no special trimmings. Text between a '&lt;' and the
     * next '&gt;' is skipped; a '&lt;' with no closing '&gt;' is escaped to {@code &lt;}.
     * Don't people know HTML yet? Do we still need the shorthand [] forms?
     *
     * @param in buffer to linkify
     * @return {@code in}
     */
    static StringBuffer autoLinkifyUrls( StringBuffer in ) {
        int pos = 0;
        while ( pos < in.length() ) {
            int[] found = findNextScheme( in, pos );
            int schemePos = found[0];
            if ( schemePos < 0 ) {
                // no matching schemes to linkify
                return in;
            }
            int nextLT = in.indexOf( "<", pos );
            if ( nextLT >= 0 && nextLT < schemePos ) {
                int nextGT = in.indexOf( ">", nextLT );
                if ( nextGT > 0 ) {
                    // zoom scan to after end of tag
                    pos = nextGT + 1;
                } else {
                    // bogus < open tag!
                    in.replace( nextLT, nextLT + 1, "&lt;" );
                    pos = nextLT + 4;
                }
            } else {
                // no '<' before match, linkify ...
                int linkend = schemePos + urlSchemes[found[1]].length();
                while ( linkend < in.length() ) {
                    char c = in.charAt( linkend );
                    if ( Character.isWhitespace( c ) || c == '<' || c == '>' ) {
                        break;
                    }
                    linkend++;
                }
                String url = in.substring( schemePos, linkend );
                String link = "<a href=\"" + url + "\">" + url + "</a>";
                in.replace( schemePos, linkend, link );
                // Resume exactly at the delimiter that ended the URL. Skipping one extra
                // character could step over the '<' of an adjacent tag, and a URL inside
                // that tag's attributes would then be linkified.
                pos = schemePos + link.length();
            }
        }
        return in;
    }

    /**
     * Finds the earliest occurrence of any entry of {@link #urlSchemes} at or after {@code from}.
     *
     * @return {@code {position, schemeIndex}}, or {@code {-1, -1}} when no scheme occurs
     */
    private static int[] findNextScheme( StringBuffer in, int from ) {
        int bestPos = -1;
        int bestScheme = -1;
        for ( int i = 0; i < urlSchemes.length; i++ ) {
            int p = in.indexOf( urlSchemes[i], from );
            if ( p >= 0 && (bestPos < 0 || p < bestPos) ) {
                bestPos = p;
                bestScheme = i;
            }
        }
        return new int[] { bestPos, bestScheme };
    }

    // Perl reference for the linkifier (markup restored by hand; treat as approximate):
    //
    // Don't match URLs with <> in them
    //   my $url_regex  = '(?:http|ftp|file)://[^\s<>]+?';
    //   my $url_regexg = '(?:http|ftp|file)://[^\s<>]+';   # greedy
    // Mark URLs that are already in HTML attrs or links so we don't linkify them
    //   s#(<[^>]+="[^">]*)($url_regex)#$1\x00$2#gso;
    //   s#(<a[^>]*href=[^>]*>[^<]*)($url_regex)#$1\x00$2#gso;
    // Grab expressions in brackets ('[]', '{}', or '<>', not '()')
    // and if they end in a URL, linkify them.
    //   s#\[([^\[][^]]+?)(?:\s|&nbsp;)*?($url_regex)\]#<a href="$2">$1</a>#gmsio;
    //   s#{([^{][^}]+?)(?:\s|&nbsp;)*($url_regex)}#<a href="$2">$1</a>#gmsio;
    //   s#\[($url_regexg)(?:\s|&nbsp;)+([^\[][^]]+?)\]#<a href="$1">$2</a>#gmsio;
    //   s#{($url_regexg)(?:\s|&nbsp;)+([^{][^}]+?)}#<a href="$1">$2</a>#gmsio;
    // Linkify all the remaining naked URLs
    //   s#([^\x00]|^)($url_regex)(?=[.!?_*=]?[\s\n<()\[\]{}\x01\x02]|$)#$1<a href="$2">$2</a>#gmio;
    // Remove placeholder chars
    //   s#\x00##gs;

    /**
     * One stage of the regex linkifier chain.
     *
     * <p>{@link #filter(CharSequence)} splits its input around matches of {@link #pat()}:
     * unmatched stretches are handed to the next stage's {@code filter}, while each match is
     * given to {@link #core(Matcher)}, whose output goes down the chain through
     * {@link #pass(CharSequence)} and is therefore not examined by later stages.
     */
    static abstract class PatternFilter {
        /** Next stage in the chain; must be set before use on every non-terminal stage. */
        PatternFilter next;

        /** Returns the pattern this stage looks for. */
        protected abstract Pattern pat();

        /** Processes {@code in}, forwarding unmatched text to the next stage for filtering. */
        public void filter( CharSequence in ) {
            Matcher lilm = pat().matcher( in );
            int pos = 0;
            if ( lilm.find() ) {
                do {
                    next.filter( in.subSequence( pos, lilm.start() ) );
                    core( lilm );
                    pos = lilm.end();
                } while ( lilm.find() );
                next.filter( in.subSequence( pos, in.length() ) );
            } else {
                next.filter( in );
            }
        }

        /** Handles one match; the default passes the matched text through unchanged. */
        public void core( Matcher lilm ) {
            next.pass( lilm.group() );
        }

        /** Forwards finished text down the chain without further matching. */
        public void pass( CharSequence cs ) {
            next.pass( cs );
        }
    }

    /** Protects URLs already inside an anchor or a tag attribute from being linkified. */
    static class LinkNoBreakFilter extends PatternFilter {
        static Pattern linkNoBreak = Pattern.compile(
            "(<a[^>]*href=[^>]*>[^<]*" + urlRegex + "?|<[^>]+=\"[^\">]*" + urlRegex + "?[^>]*>)",
            Pattern.CASE_INSENSITIVE );

        protected Pattern pat() {
            return linkNoBreak;
        }
    }

    /** Rewrites {@code [text url]} as an anchor whose label is {@code text}. */
    static class BracketFilterA extends PatternFilter {
        static Pattern linkNoBreak = Pattern.compile(
            "\\[([^\\[][^]]+?)(?:\\s|&nbsp;)*?(" + urlRegex + ")\\]",
            Pattern.CASE_INSENSITIVE );

        protected Pattern pat() {
            return linkNoBreak;
        }

        public void core( Matcher lilm ) {
            String text = lilm.group( 1 );
            String url = lilm.group( 2 );
            next.pass( "<a href=\"" + url + "\">" + text + "</a>" );
        }
    }

    /** Rewrites {@code [url text]} as an anchor whose label is {@code text}. */
    static class BracketFilterB extends PatternFilter {
        // s#\[($url_regexg)(?:\s|&nbsp;)+([^\[][^]]+?)\]#<a href="$1">$2</a>#gmsio;
        static Pattern linkNoBreak = Pattern.compile(
            "\\[(" + urlRegex + ")(?:\\s|&nbsp;)+([^\\[][^]]+?)\\]",
            Pattern.CASE_INSENSITIVE );

        protected Pattern pat() {
            return linkNoBreak;
        }

        public void core( Matcher lilm ) {
            String text = lilm.group( 2 );
            String url = lilm.group( 1 );
            next.pass( "<a href=\"" + url + "\">" + text + "</a>" );
        }
    }

    /** Wraps a remaining naked URL in an anchor that uses the URL as its own label. */
    static class NakedURLFilter extends PatternFilter {
        // s#([^\x00]|^)($url_regex)(?=[.!?_*=]?[\s\n<()\[\]{}\x01\x02]|$)#$1<a href="$2">$2</a>#gmio;
        static Pattern linkNoBreak = Pattern.compile(
            "(" + urlRegex + "?)(?=[.!?_*=]?[\\s\\n<()\\[\\]{}]|$)",
            Pattern.CASE_INSENSITIVE );

        protected Pattern pat() {
            return linkNoBreak;
        }

        public void core( Matcher lilm ) {
            String url = lilm.group( 1 );
            next.pass( "<a href=\"" + url + "\">" + url + "</a>" );
        }
    }

    /** Terminal stage: appends everything it receives to {@link #toret}. */
    static class SBAccumFilter extends PatternFilter {
        StringBuffer toret = new StringBuffer();

        protected Pattern pat() {
            return null; // never consulted; filter() is overridden
        }

        public void filter( CharSequence cs ) {
            toret.append( cs );
        }

        public void pass( CharSequence cs ) {
            toret.append( cs );
        }
    }

    /**
     * Builds the default chain: existing links/attributes, then {@code [url text]}, then
     * {@code [text url]}, then naked URLs, ending in an accumulator.
     *
     * @return the head of the chain
     */
    static PatternFilter newDefaultFilterChain() {
        SBAccumFilter tail = new SBAccumFilter();
        NakedURLFilter nu = new NakedURLFilter();
        nu.next = tail;
        BracketFilterA bfa = new BracketFilterA();
        bfa.next = nu;
        BracketFilterB bfb = new BracketFilterB();
        bfb.next = bfa;
        LinkNoBreakFilter lnbf = new LinkNoBreakFilter();
        lnbf.next = bfb;
        return lnbf;
    }

    /** Escapes {@code <} and {@code >} so markup can be displayed as text. */
    static String ltgt( String in ) {
        return in.replace( "<", "&lt;" ).replace( ">", "&gt;" );
    }

    /** Greedy URL pattern; users append {@code "?"} to make the trailing quantifier reluctant. */
    static String urlRegex = "(?:http://|ftp://|https://|mailto:)[^\\s<>]+";
    static Pattern linkInTag = Pattern.compile( "(<[^>]+=\"[^\">]*" + urlRegex + "?[^>]*>)" );
    static Pattern linkInLink = Pattern.compile( "(<a[^>]*href=[^>]*>[^<]*" + urlRegex + "?)" );
    static Pattern linkNoBreak = Pattern.compile(
        "(<a[^>]*href=[^>]*>[^<]*" + urlRegex + "?|<[^>]+=\"[^\">]*" + urlRegex + "?[^>]*>)" );

    /**
     * Linkifies URLs using the regex filter chain from {@link #newDefaultFilterChain()}.
     *
     * @param in text to linkify; not modified
     * @return a new buffer holding the linkified text
     */
    static StringBuffer autoLinkifyUrlsR( StringBuffer in ) {
        PatternFilter filts = newDefaultFilterChain();
        filts.filter( in );
        // Walk to the accumulator at the end of the chain to collect the output.
        while ( filts.next != null ) {
            filts = filts.next;
        }
        return ((SBAccumFilter) filts).toret;
    }

    /**
     * Formats plain text as HTML: {@link #plaintextFormat(StringBuffer, boolean)} with tag
     * escaping disabled, followed by {@link #autoLinkifyUrlsR(StringBuffer)}.
     *
     * <p>Most of the perl function has not been reimplemented because it is crazy or totally
     * the wrong way to do it outside of perl (and possibly inside of perl).
     *
     * @param in text to format; modified by the plain-text pass
     * @return the formatted text, in a different buffer from {@code in}
     */
    static StringBuffer autoFormat( StringBuffer in ) {
        // Not ported (perl, entities restored by hand; treat as approximate):
        //   Since we'll be using high-bit characters to mark escaped characters,
        //   turn existing ones into entities here.
        //     s!([\x80-\xFF])!'&#'.ord($1).';'!ge;
        //   Escape significant characters preceded by a backslash
        //     s#\\\\#\xDC#g;   // Escape double backslashes first
        //     s#\\<#\x81#g;  s#\\>#\x82#g;  s#\\&#\x83#g;  s#\\"#\x84#g;
        //     s#\\(\S)#chr(ord($1)|0x80)#ge;   // Mark the high bit
        //   Change non-HTML-involved &< into entities
        //     s/&(?![A-Za-z0-9#]+;)/&amp;/g;
        //     s#<(?![A-Za-z/])#&lt;#g;

        // Perform plaintext formatting
        in = plaintextFormat( in, true );

        // Run the URL linkifier here so that clean_html's word breaking doesn't mess things up.
        // Uses the regex-filter based linkifier rather than autoLinkifyUrls( in ).
        in = autoLinkifyUrlsR( in );

        // Not ported (perl):
        //   Clean up HTML tags -- this is probably a pretty big project, but hasn't it
        //   already been done somewhere? [bolson]
        //     my $comment_ref = $S->html_checker->clean_html(\$_, $context);
        //     $_ = $$comment_ref;
        //   Make non-HTML-involved <>&" easier to sniff out.
        //     s#&lt;#\x01#g;  s#&gt;#\x02#g;  s#&amp;#\x03#g;  s#&quot;#\x04#g;
        //   Escape any potentially special chars within tags and URLs exactly as if
        //   the user had escaped them with a backslash
        //     my $url_regexg = '(?:http|ftp|file)://(?:[^\s<]|\Z)+(?=[\s<]|\Z)';
        //     my $tag_regex = '<[^><]*?>';
        //     s!($url_regexg|$tag_regex)!
        //         my $a = $1; $a =~ s#([^a-zA-Z0-9])#chr(ord($1)|0x80)#ge; $a
        //     !ge;
        //     $_ = $S->_auto_bold_italic($_);
        //     $_ = $S->_auto_create_ul($_);
        //     $_ = $S->_auto_create_ol($_);
        //   Switch back the marked characters
        //     s#([\x80-\xFF])#chr(ord($1)&0x7F)#ge;
        //     s#\x01#&lt;#g;  s#\x02#&gt;#g;  s#\x03#&amp;#g;  s#\x04#&quot;#g;
        return in;
    }

    /** test autoformat */
    public static String[] testInputs = {
        "a foo haha aoeu\nthingy\n\n\nyadda",
        "http://frond.com/boo",
        "wheee .... thud\nmailto:link@this",
        "[implement text ftp://link]\n\n[https://also implement link then text]\n\ngot it?",
        // NOTE(review): the anchor markup here is reconstructed; the stripped copy showed
        // only the bare URL, which would not exercise the "overlinking" case.
        "<a href=\"http://bolson.org\">http://bolson.org</a> don't break this by overlinking it!",
        "FTP://saoeuraoeu\n\n[case INSENSITIVITY test mailto:colonel@cwru.edu]",
    };

    /**
     * Prints an HTML table comparing {@link #autoFormat(StringBuffer)} with the manual
     * linkifier for each entry of {@link #testInputs}: one row with the rendered markup and
     * one row with the same markup escaped for reading.
     *
     * <p>NOTE(review): the exact wrapper tags were lost in the stripped copy; the row/cell
     * structure below follows the order of the surviving statements.
     */
    public static void main( String[] argv ) {
        System.out.println( "<html><body><table>" );
        for ( int i = 0; i < testInputs.length; i++ ) {
            StringBuffer sba, sbb;
            sba = autoFormat( new StringBuffer( testInputs[i] ) );
            sbb = autoLinkifyUrls( plaintextFormat( new StringBuffer( testInputs[i] ), true ) );
            System.out.println( "<tr><td>" );
            System.out.println( testInputs[i] );
            System.out.println( "</td><td>" );
            System.out.println( sba );
            System.out.println( "</td><td>" );
            System.out.println( sbb );
            System.out.println( "</td></tr>" );
            System.out.println( "<tr><td></td><td>" );
            System.out.println( ltgt( sba.toString() ) );
            System.out.println( "</td><td>" );
            System.out.println( ltgt( sbb.toString() ) );
            System.out.println( "</td></tr>" );
        }
        System.out.println( "</table>" );
        // Disabled interactive mode: format each line read from stdin.
        if ( false ) {
            try {
                java.io.BufferedReader in = new java.io.BufferedReader(
                    new java.io.InputStreamReader( System.in ) );
                String line;
                while ( (line = in.readLine()) != null ) {
                    System.out.println( autoFormat( new StringBuffer( line ) ) );
                }
            } catch ( Exception e ) {
                e.printStackTrace();
            }
        }
        System.out.println( "</body></html>" );
    }
}