1/*2 * Copyright 2000-2004 The Apache Software Foundation.3 * 4 * Licensed under the Apache License, Version 2.0 (the "License");5 * you may not use this file except in compliance with the License.6 * You may obtain a copy of the License at7 * 8 * http://www.apache.org/licenses/LICENSE-2.09 * 10 * Unless required by applicable law or agreed to in writing, software11 * distributed under the License is distributed on an "AS IS" BASIS,12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.13 * See the License for the specific language governing permissions and14 * limitations under the License.15 */1617/*18 *19 *20 * COMPATIBILITY21 * 22 * [28.01.2001, RammerI] Tested on W2K, with J2SE, JDK 1.323 * [29.01.2001, RammerI] Tested on W2K, with JDK 1.2.224 *25 *26 *27 * FEATURES28 * = Rewriting of <A HREFs, <IMG SRCes, <FORM ACTIONs, <TD BACKGROUNDs,29 * <INPUT SRCs, <APPLET CODEBASEs30 * = Removal of <SCRIPT>, <STYLE>, <HEAD>, <EMBED>, <OBJECT>, <APPLET>,31 * <NOSCRIPT>32 * 33 ****34 * Please include the following section in the WebPagePortlet documentation 35 ****36 * <CODE>37 *38 * The following describes how HTML tags are rewritten39 *40 * <!-- --> (HTML Comments)41 * o Unless otherwise mentioned, comments are striped.42 * 43 * <A>44 * o HREF attribute - URL merged with base URL (See Note 1)45 * o TARGET attribute - Set to "_BLANK" if it does not exist 46 * and openInNewWindow = TRUE47 * <AREA>48 * o HREF attribute - URL merged with base URL (See Note 1)49 * o TARGET attribute - Set to "_BLANK" if it does not exist 50 * and openInNewWindow = TRUE51 * <APPLET>52 * o Optionally included53 * o CODEBASE attribute - Set to the current path if it does54 * not exist.55 * 56 * <BASE>57 * o <HEAD> does NOT have to be included.58 * o HREF attribute - Set the Base URL of the page, but the tag59 * not set in resulting HTML. 
URL merged with
 *                         base URL (See Note 1)
 *
 * <BODY>
 *   o Background attribute - Always stripped.
 *
 * <EMBED>
 *   o May not work. Not supported by JDK 1.3.
 *
 * <FORM>
 *   o ACTION attribute - Set to the current URL if it does
 *                        not exist. URL merged with base
 *                        URL (See Note 1)
 *
 * <IMG>
 *   o SRC attribute - URL merged with base URL (See Note 1)
 *
 * <INPUT>
 *   o SRC attribute - URL merged with base URL (See Note 1)
 *
 * <LINK>
 *   o HREF attribute - URL merged with base URL (See Note 1)
 *
 * <OBJECT>
 *   o Optionally included
 *   o CODEBASE attribute - Set to the current path if it does
 *                          not exist. URL merged with base
 *                          URL (See Note 1)
 *
 * <SCRIPT>
 *   o Optionally included
 *   o Contents may be stripped if this tag appears in the <HEAD>
 *     and the contents are NOT in a comment
 *   o SRC attribute - URL merged with base URL (See Note 1)
 *   o Script code that is NOT enclosed in a comment (<!-- -->)
 *     and in the <HEAD> may NOT be in the resulting HTML. This
 *     is related to the HTML parser included in the JDK.
 *
 * <TD>
 *   o BACKGROUND attribute - URL merged with base URL (See Note 1)
 *
 * Note 1: URL Merging.
 *   This is done because the source of the page sent to the
 *   user's browser is different from the source of the current page.
 *   Example:
 *     Base URL........ http://jakarta.apache.org/jetspeed
 *     URL............. logo.gif
 *     Resulting URL... http://jakarta.apache.org/jetspeed/logo.gif
 *
 * </CODE>
 * KNOWN PROBLEMS
 *
 *
 * == Seems to have problems with international characters, when the web-pages
 *    are not downloaded from the original URL but taken from the cache.
 *    (To reproduce do the following
 *     1. create a new portlet from the url http://www.sycom.at/default.htm
 *     2. stop tomcat & restart tomcat
 *     3. login and customize your page to include this portlet
 *     4. everything should appear fine, the webpage will show some German
 *        umlauts
 *     5. 
shutdown tomcat and restart it121 * 6. jetspeed is now taking the HTML not from www.sycom.at, but from the122 * cache. Instead of the umlauts, you will see weird characters. 123 *124 *125 * == Does not yet work with XHTML-Pages but only plain-old HTMLs. I.e. Closed126 * single tags like <BR /> screw the output up.127 * 128 *129 *130 */131packageorg.apache.jetspeed.util;
132133import java.io.Reader;
134import java.io.StringWriter;
135import java.net.MalformedURLException;
136import java.net.URL;
137import java.util.Enumeration;
138import javax.swing.text.html.HTML;
139import javax.swing.text.html.HTMLEditorKit;
140import javax.swing.text.MutableAttributeSet;
141142// Jetspeed classes143import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
144import org.apache.jetspeed.services.logging.JetspeedLogger;
145146/***147 *148 * @author Ingo Rammer (rammer@sycom.at)149 * @author <a href="mailto:sgala@apache.org">Santiago Gala</a>150 * @author <a href="mailto:paulsp@apache.org">Paul Spencer</a>151 * @version 0.2152 */153154publicclassHTMLRewriter155 {
156/***157 * Static initialization of the logger for this class158 */159privatestaticfinalJetspeedLogger logger = JetspeedLogFactoryService.getLogger(HTMLRewriter.class.getName());
160161private HTMLRewriter.Callback cb = new HTMLRewriter.Callback();
162163/*** Sets the parameters for the HTMLRewriter164 * @param removeScript Shall SCRIPT-Tags and their content be removed165 * @param removeStyle Shall STYLE-Tags and their content be removed166 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed167 * @param removeMeta Shall META-Tags be removed168 * @param removeApplet Shall APPLET-Tags and their content be removed169 * @param removeObject Shall OBJECT-Tags and their content be removed170 * @param removeHead Shall HEAD-Tags and their content be removed171 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed172 */173publicHTMLRewriter(boolean removeScript,
174boolean removeStyle,
175boolean removeNoScript,
176boolean removeMeta,
177boolean removeApplet,
178boolean removeObject,
179boolean removeHead,
180boolean removeOnSomething) {
181 init ( removeScript,
182 removeStyle,
183 removeNoScript,
184 removeMeta,
185 removeApplet,
186 removeObject,
187 removeHead,
188 removeOnSomething,
189 false);
190 }
191192/***193 * Sets the parameters for the HTMLRewriter194 * @param removeScript Shall SCRIPT-Tags and their content be removed195 * @param removeStyle Shall STYLE-Tags and their content be removed196 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed197 * @param removeMeta Shall META-Tags be removed198 * @param removeApplet Shall APPLET-Tags and their content be removed199 * @param removeObject Shall OBJECT-Tags and their content be removed200 * @param removeHead Shall HEAD-Tags and their content be removed201 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed202 */203publicHTMLRewriter(boolean removeScript,
204boolean removeStyle,
205boolean removeNoScript,
206boolean removeMeta,
207boolean removeApplet,
208boolean removeObject,
209boolean removeHead,
210boolean removeOnSomething,
211boolean openInNewWindow ) {
212 init ( removeScript,
213 removeStyle,
214 removeNoScript,
215 removeMeta,
216 removeApplet,
217 removeObject,
218 removeHead,
219 removeOnSomething,
220 openInNewWindow );
221 }
222223/***224 * Sets the parameters for the HTMLRewriter225 *226 * @param removeScript Shall SCRIPT-Tags and their content be removed227 * @param removeStyle Shall STYLE-Tags and their content be removed228 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed229 * @param removeMeta Shall META-Tags be removed230 * @param removeApplet Shall APPLET-Tags and their content be removed231 * @param removeObject Shall OBJECT-Tags and their content be removed232 * @param removeHead Shall HEAD-Tags and their content be removed233 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed234 * @param openInNewWindow Shall links set Target="_blank"235 */236privatevoid init (boolean removeScript,
237boolean removeStyle,
238boolean removeNoScript,
239boolean removeMeta,
240boolean removeApplet,
241boolean removeObject,
242boolean removeHead,
243boolean removeOnSomething,
244boolean openInNewWindow )
245 {
246 cb.removeScript = removeScript;
247 cb.removeStyle = removeStyle;
248 cb.removeNoScript = removeNoScript;
249 cb.removeMeta = removeMeta;
250 cb.removeApplet = removeApplet;
251 cb.removeObject = removeObject;
252 cb.removeHead = removeHead;
253 cb.removeOnSomething = removeOnSomething;
254 cb.openInNewWindow = openInNewWindow;
255 }
256257/***258 * Does the conversion of the HTML259 * @param HTMLrdr Reader for HTML to be converted260 * @param BaseUrl URL from which this HTML was taken. We be the base-Url261 * for all URL-rewritings.262 * @throws MalformedURLException If the BaseUrl is not a valid URL or if an URL inside263 * the document could not be converted. Should not happen264 * normally, even in badly formatted HTML.265 * @return HTML-String with rewritten URLs and removed (according266 * to constructor-settings) tags267 */268publicsynchronized String convertURLs(Reader HTMLrdr, String BaseUrl) throws MalformedURLException
269 {
270 HTMLEditorKit.Parser parse = new HTMLRewriter.ParserGetter().getParser();
271 String res ="";
272try {
273if (cb.result != null) {
274 cb.result = null;
275 cb.result = new StringWriter();
276 }
277 cb.baseUrl = new URL(BaseUrl);
278 parse.parse(HTMLrdr,cb,true);
279 res = cb.getResult();
280 } catch (Exception e)
281 {
282 logger.error( "Unable to convertURLS", e );
283thrownew MalformedURLException(e.toString());
284 }
285return res;
286 }
287288289/*** That Class is needed, because getParser is protected and therefore 290 * only accessibly by a subclass291 */292class ParserGetter extends HTMLEditorKit {
293/*** This is needed, because getParser is protected294 * @return Html Parser295 */296public HTMLEditorKit.Parser getParser(){
297returnsuper.getParser();
298 }
299 }
300301302class Callback extends HTMLEditorKit.ParserCallback {
303304// the base-url of which the given html comes from.305private URL baseUrl;
306307// either handling of <FORM> is buggy, or I made some weird mistake ... 308// ... JDK 1.3 sends double "</form>"-tags on closing <form>309privateboolean inForm = false;
310311312// when in multi-part ignored tags (like <script> foobar </script>, 313// <style> foobar </style>, a counter for the nesting-level will be314// kept here315privateint ignoreLevel = 0;
316317privateboolean removeScript = true;
318privateboolean removeStyle = true;
319privateboolean removeNoScript = true;
320privateboolean removeMeta = true;
321privateboolean removeApplet = true;
322privateboolean removeObject = true;
323privateboolean removeHead = true;
324privateboolean openInNewWindow = false;
325326// remove the onClick=, onBlur=, etc. - Attributes327privateboolean removeOnSomething = true;
328329privateboolean inScript = false;
330privateboolean inStyle = false;
331332private StringWriter result = new StringWriter();
333334private Callback () {
335 }
336337338private Callback addToResult(Object txt)
339 {
340// to allow for implementation using Stringbuffer or StringWriter341// I don't know yet, which one is better in this case342if (ignoreLevel > 0) returnthis;
343344try {
345 result.write(txt.toString());
346 } catch (Exception e) { /* ignore */ }347returnthis;
348 }
349350private Callback addToResult(char[] txt)
351 {
352if (ignoreLevel > 0) returnthis;
353354try {
355 result.write(txt);
356 } catch (Exception e) { /* ignore */ }357returnthis;
358 }
359360/*** Accessor to the Callback's content-String361 * @return Cleaned and rewritten HTML-Content362 */363public String getResult() {
364try {
365 result.flush();
366 } catch (Exception e) { /* ignore */ }367368// WARNING: doesn't work, if you remove " " + ... but don't know why369 String res = " " + result.toString();
370371return res;
372 }
373374375publicvoid flush() throws javax.swing.text.BadLocationException {
376// nothing to do here ...377 }
378379/*** 380 * Because Scripts and Stlyle sometimes are defined in comments, thoese381 * will be written. Otherwise comments are removed382 */383publicvoid handleComment(char[] values,int param) {
384if ( !( inStyle || inScript))
385return;
386387try {
388 result.write("<!--");
389 result.write(values);
390 result.write("-->");
391 } catch (Exception e) { /* ignore */ }392// we ignore them 393 }
394395publicvoid handleEndOfLineString(java.lang.String str) {
396 addToResult("\n");
397 }
398399publicvoid handleError(java.lang.String str,int param) {
400// ignored401 }
402403publicvoid handleSimpleTag(HTML.Tag tag,MutableAttributeSet attrs,int param) {
404if (removeMeta && (tag == HTML.Tag.META)) {
405return;
406 }
407 appendTagToResult(tag,attrs);
408 }
409410publicvoid handleStartTag(HTML.Tag tag, MutableAttributeSet attrs, int position) {
411 appendTagToResult(tag,attrs);
412 }
413414publicvoid handleEndTag(HTML.Tag tag, int position) {
415if ((tag ==HTML.Tag.FORM) && (inForm)) {
416// form handling seems to be buggy417 addToResult("</").addToResult(tag).addToResult(">");
418 inForm = false;
419 } elseif (tag == HTML.Tag.FORM) {
420// do nothing! ... i.e. we are now outside of any <FORM>, so a421// closing </form> is not really needed ... 422 } else {
423 addToResult("</").addToResult(tag).addToResult(">");
424 }
425426427if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
428 inScript = false;
429 } elseif ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
430 inStyle = false;
431 }
432433if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
434 ignoreLevel --;
435 } elseif ( removeStyle && (tag == HTML.Tag.STYLE)) {
436 ignoreLevel --;
437 } elseif ( removeHead && (tag == HTML.Tag.HEAD)) {
438 ignoreLevel --;
439 } elseif ( removeApplet && (tag == HTML.Tag.APPLET)) {
440 ignoreLevel --;
441 } elseif ( removeObject && (tag == HTML.Tag.OBJECT)) {
442 ignoreLevel --;
443 } elseif ( removeNoScript && (tag.toString().equalsIgnoreCase("NOSCRIPT"))) {
444 ignoreLevel --;
445 }
446 }
447448privatevoid appendTagToResult(HTML.Tag tag, MutableAttributeSet attrs) {
449450if (tag.toString().equalsIgnoreCase("__ENDOFLINETAG__")) {
451// jdk 1.2.2 places a tag <__ENDOFLINETAG__> in the result ...452// we don't want this one453return;
454 }
455456if (tag.toString().equalsIgnoreCase("__IMPLIED__")) {
457// jdk 1.3 places a tag <__IMPLIED__> in the result ...458// we don't want this one459return;
460 }
461462 convertURLS(tag,attrs);
463 Enumeration e = attrs.getAttributeNames();
464if (tag == HTML.Tag.BASE)
465return;
466467 addToResult("<").addToResult(tag);
468while (e.hasMoreElements()) {
469 Object attr = e.nextElement();
470 String attrName = attr.toString();
471 String value = attrs.getAttribute(attr).toString();
472473// include attribute only when Not(RemoveOnSomething = True and starts with "on")474if (!(removeOnSomething
475 && attrName.toLowerCase().startsWith("on")
476 && (attrName.length() > 2))) {
477// Attribute included478 addToResult(" ").addToResult(attr).addToResult("=\"")
479 .addToResult(value).addToResult("\"");
480 }
481 }
482 addToResult(">");
483 }
484485/*** Here the magic happens.486 *487 * If someone wants new types of URLs to be rewritten, add them here488 * @param tag TAG from the Callback-Interface489 * @param attrs Attribute-Set from the Callback-Interface490 */491492privatevoid convertURLS( HTML.Tag tag, MutableAttributeSet attrs ) {
493494// first we do an URL-rewrite on different tags495496if (tag == HTML.Tag.A) {
497if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
498// ---- CHECKING <A HREF499 addConvertedAttribute( HTML.Attribute.HREF,
500 attrs );
501 }
502if ((attrs.getAttribute(HTML.Attribute.TARGET) == null) && cb.openInNewWindow) {
503 attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
504 }
505 } elseif (tag == HTML.Tag.AREA) {
506if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
507// ---- CHECKING <A HREF508 addConvertedAttribute( HTML.Attribute.HREF,
509 attrs );
510 }
511if ((attrs.getAttribute(HTML.Attribute.TARGET) == null) && cb.openInNewWindow) {
512 attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
513 }
514 } elseif (((tag == HTML.Tag.IMG) || (tag == HTML.Tag.INPUT) || (tag == HTML.Tag.SCRIPT))
515 && (attrs.getAttribute(HTML.Attribute.SRC) != null)) {
516// ---- CHECKING <IMG SRC & <INPUT SRC517 addConvertedAttribute( HTML.Attribute.SRC,
518 attrs );
519 } elseif (tag == HTML.Tag.LINK) {
520if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
521// ---- CHECKING <LINK HREF522 addConvertedAttribute( HTML.Attribute.HREF,
523 attrs );
524 }
525 } elseif ( tag == HTML.Tag.APPLET ) {
526// ---- CHECKING <APPLET CODEBASE=527if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
528int endOfPath = baseUrl.toString().lastIndexOf("/");
529 attrs.addAttribute(HTML.Attribute.CODEBASE,
530 baseUrl.toString().substring(0,endOfPath +1));
531 } else {
532 addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
533 }
534 } elseif (tag == HTML.Tag.OBJECT) {
535// ---- CHECKING <OBJECT CODEBASE=536if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
537int endOfPath = baseUrl.toString().lastIndexOf("/");
538 attrs.addAttribute(HTML.Attribute.CODEBASE,
539 baseUrl.toString().substring(0,endOfPath +1));
540 } else {
541 addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
542 }
543 } elseif (tag == HTML.Tag.BODY) {
544if (attrs.getAttribute(HTML.Attribute.BACKGROUND) != null) {
545// background images are applied to the ENTIRE page, this remove them!546 attrs.removeAttribute( HTML.Attribute.BACKGROUND);
547 }
548 } elseif (tag == HTML.Tag.BASE) {
549if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
550try {
551 baseUrl = new URL(attrs.getAttribute(HTML.Attribute.HREF).toString());
552 } catch (Throwable t) {
553 logger.error( "HTMLRewriter: Setting BASE="554 + attrs.getAttribute(HTML.Attribute.HREF).toString()
555 + t.getMessage());
556 }
557 attrs.removeAttribute(HTML.Attribute.HREF);
558 }
559 } elseif (tag == HTML.Tag.FORM) {
560// ---- CHECKING <FORM ACTION=561 inForm = true; // buggy <form> handling in jdk 1.3 562if (attrs.getAttribute(HTML.Attribute.ACTION) == null) {
563//self referencing <FORM>564 attrs.addAttribute(HTML.Attribute.ACTION,
565 baseUrl.toString());
566 } else {
567 addConvertedAttribute( HTML.Attribute.ACTION,
568 attrs );
569 }
570 } elseif (tag == HTML.Tag.TD) {
571// ---- CHECKING <TD BACKGROUND=572if (! (attrs.getAttribute(HTML.Attribute.BACKGROUND) == null)) {
573 addConvertedAttribute( HTML.Attribute.BACKGROUND,
574 attrs );
575 }
576 }
577578579// then we check for ignored tags ...580// btw. I know, that this code could be written in a shorter way, but581// I think it's more readable like this ...582583// don't forget to add changes to handleEndTag() as well, else 584// things will get screwed up!585586if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
587 inScript = true;
588 } elseif ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
589 inStyle = true;
590 }
591592if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
593 ignoreLevel ++;
594 } elseif ( removeStyle && (tag == HTML.Tag.STYLE)) {
595 ignoreLevel ++;
596 } elseif ( removeHead && (tag == HTML.Tag.HEAD)) {
597 ignoreLevel ++;
598 } elseif ( removeApplet && (tag == HTML.Tag.APPLET)) {
599 ignoreLevel ++;
600 } elseif ( removeObject && (tag == HTML.Tag.OBJECT)) {
601 ignoreLevel ++;
602 } elseif (removeNoScript && (tag.toString().equalsIgnoreCase("NOSCRIPT"))) {
603 ignoreLevel ++;
604 }
605 }
606607/***608 *609 * Converts the given attribute to base URL, if not null610 *611 */612privatevoid addConvertedAttribute( HTML.Attribute attr,
613 MutableAttributeSet attrs ) {
614if( attrs.getAttribute( attr ) != null ) {
615 String attrSource = attrs.getAttribute( attr ).toString();
616 attrs.addAttribute( attr,
617 generateNewUrl( attrSource ) );
618 }
619 }
620621622private String generateNewUrl(String oldURL) {
623try {
624 URL x = new URL(baseUrl,oldURL);
625return x.toString();
626 } catch (Throwable t) {
627if (oldURL.toLowerCase().startsWith("javascript:")) {
628return oldURL;
629 }
630 logger.error( "HTMLRewriter: Setting BASE="631 + baseUrl
632 + " Old = "633 + oldURL
634 + t.getMessage());
635return oldURL; // default behaviour ...636 }
637 }
638639publicvoid handleText(char[] values,int param) {
640 addToResult(values);
641 }
642 }
643 }