View Javadoc

1   /*
2    * Copyright 2000-2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  /*
18   *
19   *
20   *  COMPATIBILITY
21   *  
22   *      [28.01.2001, RammerI] Tested on W2K, with J2SE, JDK 1.3
23   *      [29.01.2001, RammerI] Tested on W2K, with JDK 1.2.2
24   *
25   *
26   *
27   *  FEATURES
28   *      = Rewriting of <A HREFs, <IMG SRCes, <FORM ACTIONs, <TD BACKGROUNDs,
29   *          <INPUT SRCs, <APPLET CODEBASEs
30   *      = Removal of <SCRIPT>, <STYLE>, <HEAD>, <EMBED>, <OBJECT>, <APPLET>,
31   *          <NOSCRIPT>
32   * 
33   ****
34   * Please include the following section in the WebPagePortlet documentation     
35   ****
36   * <CODE>
37   *
38   * The following describes how HTML tags are rewritten
39   *
40   * <!-- --> (HTML Comments)
41   *   o Unless otherwise mentioned, comments are stripped.
42   * 
43   * <A>
44   *   o HREF attribute   - URL merged with base URL (See Note 1)
45   *   o TARGET attribute - Set to "_BLANK" if it does not exist 
46   *                        and openInNewWindow = TRUE
47   * <AREA>
48   *   o HREF attribute   - URL merged with base URL (See Note 1)
49   *   o TARGET attribute - Set to "_BLANK" if it does not exist 
50   *                        and openInNewWindow = TRUE
51   * <APPLET>
52   *   o Optionally included
53   *   o CODEBASE attribute - Set to the current path if it does
54   *                          not exist.
55   * 
56   * <BASE>
57   *   o <HEAD> does NOT have to be included.
58   *   o HREF attribute  - Set the Base URL of the page, but the tag
59   *                       not set in resulting HTML. URL merged with
60   *                       base URL (See Note 1)
61   * 
62   * <BODY>
63   *   o Background attribute - Always stripped.
64   * 
65   * <EMBED>
66   *   o May not work.  Not supported by JDK 1.3.
67   * 
68   * <FORM>
69   *   o ACTION attribute - Set to the current URL if it does
70   *                        not exist. URL merged with base
71   *                        URL (See Note 1)
72   * 
73   * <IMG>
74   *   o SRC attribute - URL merged with base URL (See Note 1)
75   * 
76   * <INPUT>
77   *   o SRC attribute - URL merged with base URL (See Note 1)
78   * 
79   * <LINK>
80   *   o HREF attribute - URL merged with base URL (See Note 1)
81   *
82   * <OBJECT>
83   *   o Optionally included
84   *   o CODEBASE attribute - Set to the current path if it does
85   *                          not exist. URL merged with base
86   *                          URL (See Note 1)
87   * 
88   * <SCRIPT>
89   *   o Optionally included
90   *   o Contents may be stripped if this tag appears in the <HEAD>
91   *     and the contents are NOT in a comment
92   *   o SRC attribute - URL merged with base URL (See Note 1)
93   *   o Script code that is NOT enclosed in a comment (<!-- -->)
94   *     and in the <HEAD> may NOT be in the resulting HTML.  This
95   *     is related to the HTML parser included in the JDK
96   * 
97   * <TD>
98   *   o BACKGROUND attribute - URL merged with base URL (See Note 1)
99   * 
100  * Note 1: URL Merging.
101  *   This is done because the location of the page sent to the
102  *   user's browser is different from the location of the original page.
103  *   Example:
104  *     Base URL........ http://jakarta.apache.org/jetspeed
105  *     URL............. logo.gif
106  *     Resulting URL... http://jakarta.apache.org/jetspeed/logo.gif
107  * 
108  * </CODE>
109  *  KNOWN PROBLEMS
110  *
111  *
112  *  == Seems to have problems with international characters, when the web-pages
113  *     are not downloaded from the original URL but taken from the cache.
114  *     (To reproduce do the following
115  *      1. create a new portlet from the url http://www.sycom.at/default.htm
116  *      2. stop tomcat & restart tomcat
117  *      3. login and customize your page to include this portlet
118  *      4. everything should appear fine, the webpage will show some german 
119  *         umlauts
120  *      5. shutdown tomcat and restart it
121  *      6. jetspeed is now taking the HTML not from www.sycom.at, but from the
122  *         cache. Instead of the umlauts, you will see weird characters. 
123  *
124  *
125  *  == Does not yet work with XHTML-Pages but only plain-old HTMLs. I.e. Closed
126  *     single tags like <BR /> screw the output up.
127  *      
128  *
129  *
130  */
131 package org.apache.jetspeed.util;
132 
133 import java.io.Reader;
134 import java.io.StringWriter;
135 import java.net.MalformedURLException;
136 import java.net.URL;
137 import java.util.Enumeration;
138 import javax.swing.text.html.HTML;
139 import javax.swing.text.html.HTMLEditorKit;
140 import javax.swing.text.MutableAttributeSet;
141 
142 // Jetspeed classes
143 import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
144 import org.apache.jetspeed.services.logging.JetspeedLogger;
145 
146 /***
147  *
148  * @author  Ingo Rammer (rammer@sycom.at)
149  * @author <a href="mailto:sgala@apache.org">Santiago Gala</a>
150  * @author <a href="mailto:paulsp@apache.org">Paul Spencer</a>
151  * @version 0.2
152  */
153 
154 public class HTMLRewriter 
155 {
156     /***
157      * Static initialization of the logger for this class
158      */    
159     private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(HTMLRewriter.class.getName());
160     
161     private HTMLRewriter.Callback cb = new HTMLRewriter.Callback();
162     
163 /*** Sets the parameters for the HTMLRewriter
164  * @param removeScript Shall SCRIPT-Tags and their content be removed
165  * @param removeStyle Shall STYLE-Tags and their content be removed
166  * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
167  * @param removeMeta Shall META-Tags be removed
168  * @param removeApplet Shall APPLET-Tags and their content be removed
169  * @param removeObject Shall OBJECT-Tags and their content be removed
170  * @param removeHead Shall HEAD-Tags and their content be removed
171  * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
172  */    
173     public HTMLRewriter(boolean removeScript,
174                         boolean removeStyle,
175                         boolean removeNoScript,
176                         boolean removeMeta,
177                         boolean removeApplet,
178                         boolean removeObject,
179                         boolean removeHead,
180                         boolean removeOnSomething) {
181         init ( removeScript,
182         removeStyle,
183         removeNoScript,
184         removeMeta,
185         removeApplet,
186         removeObject,
187         removeHead,
188         removeOnSomething,
189         false);
190     }
191         
192     /***
193      * Sets the parameters for the HTMLRewriter
194      * @param removeScript Shall SCRIPT-Tags and their content be removed
195      * @param removeStyle Shall STYLE-Tags and their content be removed
196      * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
197      * @param removeMeta Shall META-Tags be removed
198      * @param removeApplet Shall APPLET-Tags and their content be removed
199      * @param removeObject Shall OBJECT-Tags and their content be removed
200      * @param removeHead Shall HEAD-Tags and their content be removed
201      * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
202      */
203     public HTMLRewriter(boolean removeScript,
204                         boolean removeStyle,
205                         boolean removeNoScript,
206                         boolean removeMeta,
207                         boolean removeApplet,
208                         boolean removeObject,
209                         boolean removeHead,
210                         boolean removeOnSomething,
211                         boolean openInNewWindow ) {
212         init ( removeScript,
213         removeStyle,
214         removeNoScript,
215         removeMeta,
216         removeApplet,
217         removeObject,
218         removeHead,
219         removeOnSomething,
220         openInNewWindow ); 
221     }
222 
223     /***
224      * Sets the parameters for the HTMLRewriter
225      *
226      * @param removeScript Shall SCRIPT-Tags and their content be removed
227      * @param removeStyle Shall STYLE-Tags and their content be removed
228      * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
229      * @param removeMeta Shall META-Tags be removed
230      * @param removeApplet Shall APPLET-Tags and their content be removed
231      * @param removeObject Shall OBJECT-Tags and their content be removed
232      * @param removeHead Shall HEAD-Tags and their content be removed
233      * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
234      * @param openInNewWindow Shall links set Target="_blank"
235      */
236     private void init (boolean removeScript,
237                        boolean removeStyle,
238                        boolean removeNoScript,
239                        boolean removeMeta,
240                        boolean removeApplet,
241                        boolean removeObject,
242                        boolean removeHead,
243                        boolean removeOnSomething,
244                        boolean openInNewWindow ) 
245     {
246         cb.removeScript = removeScript;
247         cb.removeStyle = removeStyle; 
248         cb.removeNoScript = removeNoScript;
249         cb.removeMeta = removeMeta;
250         cb.removeApplet = removeApplet;
251         cb.removeObject = removeObject;
252         cb.removeHead = removeHead;
253         cb.removeOnSomething = removeOnSomething;    
254         cb.openInNewWindow = openInNewWindow;    
255     }
256     
257     /***
258      * Does the conversion of the HTML
259      * @param HTMLrdr Reader for HTML to be converted
260      * @param BaseUrl URL from which this HTML was taken. We be the base-Url
261      * for all URL-rewritings.
262      * @throws MalformedURLException If the BaseUrl is not a valid URL or if an URL inside
263      * the document could not be converted. Should not happen
264      * normally, even in badly formatted HTML.
265      * @return HTML-String with rewritten URLs and removed (according
266      * to constructor-settings) tags
267      */
268     public synchronized String convertURLs(Reader HTMLrdr, String BaseUrl) throws MalformedURLException
269     {
270         HTMLEditorKit.Parser parse = new HTMLRewriter.ParserGetter().getParser();        
271         String res ="";
272         try {
273             if (cb.result != null) {
274               cb.result = null;
275               cb.result = new StringWriter();
276             }
277             cb.baseUrl = new URL(BaseUrl);
278             parse.parse(HTMLrdr,cb,true);
279             res = cb.getResult(); 
280         } catch (Exception e)
281         {
282             logger.error( "Unable to convertURLS", e );
283             throw new MalformedURLException(e.toString());
284         }
285         return res;
286     }
287 
288     
289     /*** That Class is needed, because getParser is protected and therefore 
290      *  only accessibly by a subclass
291      */
292     class ParserGetter extends HTMLEditorKit {
293     /*** This is needed, because getParser is protected
294      * @return Html Parser
295      */        
296       public HTMLEditorKit.Parser getParser(){
297         return super.getParser();
298       }
299     } 
300 
301     
302     class Callback extends HTMLEditorKit.ParserCallback {
303 
304         // the base-url of which the given html comes from.
305         private URL baseUrl;
306 
307         // either handling of <FORM> is buggy, or I made some weird mistake ... 
308         // ... JDK 1.3 sends double "</form>"-tags on closing <form>
309         private boolean inForm = false; 
310 
311         
312         // when in multi-part ignored tags (like <script> foobar </script>, 
313         // <style> foobar </style>, a counter for the nesting-level will be
314         // kept here
315         private int ignoreLevel = 0;
316         
317         private boolean removeScript = true;
318         private boolean removeStyle = true; 
319         private boolean removeNoScript = true;
320         private boolean removeMeta = true;
321         private boolean removeApplet = true;
322         private boolean removeObject = true;
323         private boolean removeHead = true;
324         private boolean openInNewWindow = false;
325         
326         // remove the onClick=, onBlur=, etc. - Attributes
327         private boolean removeOnSomething = true;
328         
329         private boolean inScript = false;
330         private boolean inStyle = false;
331         
332         private StringWriter result = new StringWriter();
333         
334         private Callback () {
335         }
336         
337         
338         private Callback addToResult(Object txt)
339         {
340             // to allow for implementation using Stringbuffer or StringWriter
341             // I don't know yet, which one is better in this case
342             if (ignoreLevel > 0) return this;
343 
344             try {
345                 result.write(txt.toString());
346             } catch (Exception e) { /* ignore */ }
347             return this;
348         }
349 
350         private Callback addToResult(char[] txt)
351         {
352             if (ignoreLevel > 0) return this;
353 
354             try {
355                 result.write(txt);
356             } catch (Exception e) { /* ignore */ }
357             return this;
358         }
359         
360         /*** Accessor to the Callback's content-String
361          * @return Cleaned and rewritten HTML-Content
362          */        
363         public String getResult() {
364             try {
365                 result.flush();
366             } catch (Exception e) { /* ignore */ }
367             
368             // WARNING: doesn't work, if you remove " " + ... but don't know why
369             String res = " " + result.toString(); 
370 
371             return res;
372         }
373         
374        
375         public void flush() throws javax.swing.text.BadLocationException {
376             // nothing to do here ...
377         }
378 
379         /*** 
380          * Because Scripts and Stlyle sometimes are defined in comments, thoese
381          * will be written. Otherwise comments are removed
382          */
383         public void handleComment(char[] values,int param) {
384             if ( !( inStyle || inScript))
385                 return;
386 
387             try {
388                 result.write("<!--");
389                 result.write(values);
390                 result.write("-->");
391             } catch (Exception e) { /* ignore */ }
392           // we ignore them 
393         }
394 
395         public void handleEndOfLineString(java.lang.String str) {
396             addToResult("\n");
397         }
398 
399         public void handleError(java.lang.String str,int param) {
400             // ignored
401         }
402 
403         public void handleSimpleTag(HTML.Tag tag,MutableAttributeSet attrs,int param) {
404             if (removeMeta && (tag == HTML.Tag.META)) {
405                 return;
406             }            
407             appendTagToResult(tag,attrs);        
408         }
409 
410         public void handleStartTag(HTML.Tag tag,  MutableAttributeSet attrs, int position) {
411             appendTagToResult(tag,attrs);
412         }
413 
414         public void handleEndTag(HTML.Tag tag, int position) {
415             if ((tag ==HTML.Tag.FORM) && (inForm)) { 
416                 // form handling seems to be buggy
417                 addToResult("</").addToResult(tag).addToResult(">");
418                 inForm = false;
419             } else if (tag == HTML.Tag.FORM) {
420                 // do nothing! ... i.e. we are now outside of any <FORM>, so a
421                 // closing </form> is not really needed ... 
422             } else {
423                 addToResult("</").addToResult(tag).addToResult(">");
424             }
425             
426             
427             if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
428                 inScript = false;
429             } else if ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
430                 inStyle = false;
431             }
432 
433             if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
434                 ignoreLevel --;
435             } else if ( removeStyle && (tag == HTML.Tag.STYLE)) {
436                 ignoreLevel --;
437             } else if ( removeHead && (tag == HTML.Tag.HEAD)) {
438                 ignoreLevel --;
439             } else if ( removeApplet && (tag == HTML.Tag.APPLET)) {
440                 ignoreLevel --;
441             } else if ( removeObject && (tag == HTML.Tag.OBJECT)) {
442                 ignoreLevel --;
443             } else if ( removeNoScript && (tag.toString().equalsIgnoreCase("NOSCRIPT"))) {
444                 ignoreLevel --;
445             }
446         }
447   
448         private void appendTagToResult(HTML.Tag tag, MutableAttributeSet attrs) {
449 
450             if (tag.toString().equalsIgnoreCase("__ENDOFLINETAG__")) {
451                 // jdk 1.2.2 places a tag <__ENDOFLINETAG__> in the result ...
452                 // we don't want this one
453                 return;
454             }
455             
456             if (tag.toString().equalsIgnoreCase("__IMPLIED__")) {
457                 // jdk 1.3 places a tag <__IMPLIED__> in the result ...
458                 // we don't want this one
459                 return;
460             }
461             
462             convertURLS(tag,attrs);
463             Enumeration e = attrs.getAttributeNames();
464             if (tag == HTML.Tag.BASE)
465                 return;
466             
467             addToResult("<").addToResult(tag);
468             while (e.hasMoreElements()) {
469                 Object attr = e.nextElement();
470                 String attrName = attr.toString();
471                 String value = attrs.getAttribute(attr).toString();
472 
473                 // include attribute only when Not(RemoveOnSomething = True and starts with "on")
474                 if (!(removeOnSomething
475                 && attrName.toLowerCase().startsWith("on")
476                 && (attrName.length() > 2))) {
477                     // Attribute included
478                     addToResult(" ").addToResult(attr).addToResult("=\"")
479                     .addToResult(value).addToResult("\"");
480                 }
481             }
482             addToResult(">");
483         }
484                    
485         /*** Here the magic happens.
486          *
487          * If someone wants new types of URLs to be rewritten, add them here
488          * @param tag TAG from the Callback-Interface
489          * @param attrs Attribute-Set from the Callback-Interface
490          */
491         
492         private void convertURLS( HTML.Tag tag, MutableAttributeSet attrs ) {
493 
494            // first we do an URL-rewrite on different tags
495             
496             if (tag == HTML.Tag.A) {
497                 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
498                     // ---- CHECKING <A HREF
499                     addConvertedAttribute( HTML.Attribute.HREF,
500                     attrs );
501                 }
502                 if ((attrs.getAttribute(HTML.Attribute.TARGET) == null) && cb.openInNewWindow) {
503                     attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
504                 }
505             } else if (tag == HTML.Tag.AREA) {
506                 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
507                     // ---- CHECKING <A HREF
508                     addConvertedAttribute( HTML.Attribute.HREF,
509                     attrs );
510                 }
511                 if ((attrs.getAttribute(HTML.Attribute.TARGET) == null) && cb.openInNewWindow) {
512                     attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
513                 }
514             } else if (((tag == HTML.Tag.IMG) || (tag == HTML.Tag.INPUT) || (tag == HTML.Tag.SCRIPT))
515                          && (attrs.getAttribute(HTML.Attribute.SRC) != null)) {
516                 // ---- CHECKING <IMG SRC & <INPUT SRC
517                 addConvertedAttribute( HTML.Attribute.SRC,
518                                        attrs );
519             } else if (tag == HTML.Tag.LINK) {
520                 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
521                     // ---- CHECKING <LINK HREF
522                     addConvertedAttribute( HTML.Attribute.HREF,
523                     attrs );
524                 }
525             } else if ( tag == HTML.Tag.APPLET ) {
526                 // ---- CHECKING <APPLET CODEBASE=
527                 if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
528                     int endOfPath = baseUrl.toString().lastIndexOf("/");
529                     attrs.addAttribute(HTML.Attribute.CODEBASE, 
530                                        baseUrl.toString().substring(0,endOfPath +1));
531                 } else {
532                     addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
533                 }
534             } else if (tag == HTML.Tag.OBJECT) {
535                 // ---- CHECKING <OBJECT CODEBASE=
536                 if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
537                     int endOfPath = baseUrl.toString().lastIndexOf("/");
538                     attrs.addAttribute(HTML.Attribute.CODEBASE, 
539                                        baseUrl.toString().substring(0,endOfPath +1));
540                 } else {
541                     addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
542                 }
543             } else if (tag == HTML.Tag.BODY) {
544                 if (attrs.getAttribute(HTML.Attribute.BACKGROUND) != null) {
545                     // background images are applied to the ENTIRE page, this remove them!
546                     attrs.removeAttribute( HTML.Attribute.BACKGROUND);
547                 }
548             } else if (tag == HTML.Tag.BASE) {
549                 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
550                     try {
551                         baseUrl = new URL(attrs.getAttribute(HTML.Attribute.HREF).toString());
552                     } catch (Throwable t) {
553                         logger.error( "HTMLRewriter: Setting BASE=" 
554                         + attrs.getAttribute(HTML.Attribute.HREF).toString()
555                         + t.getMessage());
556                     }
557                     attrs.removeAttribute(HTML.Attribute.HREF);
558                 }
559             } else if (tag == HTML.Tag.FORM) {
560                 // ---- CHECKING <FORM ACTION=
561                   inForm = true; // buggy <form> handling in jdk 1.3 
562                   if (attrs.getAttribute(HTML.Attribute.ACTION) == null) {
563                       //self referencing <FORM>
564                        attrs.addAttribute(HTML.Attribute.ACTION,
565                                           baseUrl.toString());
566                   } else {
567                         addConvertedAttribute( HTML.Attribute.ACTION,
568                                                attrs );
569                   }
570             } else if (tag == HTML.Tag.TD) {
571                 // ---- CHECKING <TD BACKGROUND=
572                   if (! (attrs.getAttribute(HTML.Attribute.BACKGROUND) == null)) {
573                       addConvertedAttribute( HTML.Attribute.BACKGROUND,
574                                              attrs );
575                   }
576             }
577 
578             
579             // then we check for ignored tags ...
580             // btw. I know, that this code could be written in a shorter way, but
581             // I think it's more readable like this ...
582 
583             // don't forget to add changes to  handleEndTag() as well, else 
584             // things will get screwed up!
585             
586             if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
587                 inScript = true;
588             } else if ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
589                 inStyle = true;
590             }
591 
592             if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
593                   ignoreLevel ++;
594             } else if ( removeStyle && (tag == HTML.Tag.STYLE)) {
595                   ignoreLevel ++;
596             } else if ( removeHead && (tag == HTML.Tag.HEAD)) {
597                   ignoreLevel ++;
598             } else if ( removeApplet && (tag == HTML.Tag.APPLET)) {
599                   ignoreLevel ++;
600             } else if ( removeObject && (tag == HTML.Tag.OBJECT)) {
601                   ignoreLevel ++;
602             } else if (removeNoScript && (tag.toString().equalsIgnoreCase("NOSCRIPT"))) {
603                   ignoreLevel ++;
604             }
605         }
606 
607         /***
608          *
609          * Converts the given attribute to base URL, if not null
610          *
611          */
612         private void addConvertedAttribute( HTML.Attribute attr,
613                                             MutableAttributeSet attrs ) {
614             if( attrs.getAttribute( attr ) != null ) {
615                 String attrSource =  attrs.getAttribute( attr ).toString();
616                 attrs.addAttribute( attr,
617                                     generateNewUrl( attrSource ) );
618             }
619         }
620               
621               
622         private String generateNewUrl(String oldURL) {
623             try {
624                 URL x = new URL(baseUrl,oldURL);
625                 return x.toString();
626             } catch (Throwable t) {
627                 if (oldURL.toLowerCase().startsWith("javascript:")) {
628                     return oldURL;
629                 }
630                 logger.error( "HTMLRewriter: Setting BASE="
631                 + baseUrl
632                 + " Old = "
633                 + oldURL
634                 + t.getMessage());
635                 return oldURL; // default behaviour ...
636             }
637         }
638 
639         public void handleText(char[] values,int param) {
640             addToResult(values);
641         }
642     }
643 }