1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131 package org.apache.jetspeed.util;
132
133 import java.io.Reader;
134 import java.io.StringWriter;
135 import java.net.MalformedURLException;
136 import java.net.URL;
137 import java.util.Enumeration;
138 import javax.swing.text.html.HTML;
139 import javax.swing.text.html.HTMLEditorKit;
140 import javax.swing.text.MutableAttributeSet;
141
142
143 import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
144 import org.apache.jetspeed.services.logging.JetspeedLogger;
145
146 /***
147 *
148 * @author Ingo Rammer (rammer@sycom.at)
149 * @author <a href="mailto:sgala@apache.org">Santiago Gala</a>
150 * @author <a href="mailto:paulsp@apache.org">Paul Spencer</a>
151 * @version 0.2
152 */
153
154 public class HTMLRewriter
155 {
156 /***
157 * Static initialization of the logger for this class
158 */
159 private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(HTMLRewriter.class.getName());
160
161 private HTMLRewriter.Callback cb = new HTMLRewriter.Callback();
162
163 /*** Sets the parameters for the HTMLRewriter
164 * @param removeScript Shall SCRIPT-Tags and their content be removed
165 * @param removeStyle Shall STYLE-Tags and their content be removed
166 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
167 * @param removeMeta Shall META-Tags be removed
168 * @param removeApplet Shall APPLET-Tags and their content be removed
169 * @param removeObject Shall OBJECT-Tags and their content be removed
170 * @param removeHead Shall HEAD-Tags and their content be removed
171 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
172 */
173 public HTMLRewriter(boolean removeScript,
174 boolean removeStyle,
175 boolean removeNoScript,
176 boolean removeMeta,
177 boolean removeApplet,
178 boolean removeObject,
179 boolean removeHead,
180 boolean removeOnSomething) {
181 init ( removeScript,
182 removeStyle,
183 removeNoScript,
184 removeMeta,
185 removeApplet,
186 removeObject,
187 removeHead,
188 removeOnSomething,
189 false);
190 }
191
192 /***
193 * Sets the parameters for the HTMLRewriter
194 * @param removeScript Shall SCRIPT-Tags and their content be removed
195 * @param removeStyle Shall STYLE-Tags and their content be removed
196 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
197 * @param removeMeta Shall META-Tags be removed
198 * @param removeApplet Shall APPLET-Tags and their content be removed
199 * @param removeObject Shall OBJECT-Tags and their content be removed
200 * @param removeHead Shall HEAD-Tags and their content be removed
201 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
202 */
203 public HTMLRewriter(boolean removeScript,
204 boolean removeStyle,
205 boolean removeNoScript,
206 boolean removeMeta,
207 boolean removeApplet,
208 boolean removeObject,
209 boolean removeHead,
210 boolean removeOnSomething,
211 boolean openInNewWindow ) {
212 init ( removeScript,
213 removeStyle,
214 removeNoScript,
215 removeMeta,
216 removeApplet,
217 removeObject,
218 removeHead,
219 removeOnSomething,
220 openInNewWindow );
221 }
222
223 /***
224 * Sets the parameters for the HTMLRewriter
225 *
226 * @param removeScript Shall SCRIPT-Tags and their content be removed
227 * @param removeStyle Shall STYLE-Tags and their content be removed
228 * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
229 * @param removeMeta Shall META-Tags be removed
230 * @param removeApplet Shall APPLET-Tags and their content be removed
231 * @param removeObject Shall OBJECT-Tags and their content be removed
232 * @param removeHead Shall HEAD-Tags and their content be removed
233 * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
234 * @param openInNewWindow Shall links set Target="_blank"
235 */
236 private void init (boolean removeScript,
237 boolean removeStyle,
238 boolean removeNoScript,
239 boolean removeMeta,
240 boolean removeApplet,
241 boolean removeObject,
242 boolean removeHead,
243 boolean removeOnSomething,
244 boolean openInNewWindow )
245 {
246 cb.removeScript = removeScript;
247 cb.removeStyle = removeStyle;
248 cb.removeNoScript = removeNoScript;
249 cb.removeMeta = removeMeta;
250 cb.removeApplet = removeApplet;
251 cb.removeObject = removeObject;
252 cb.removeHead = removeHead;
253 cb.removeOnSomething = removeOnSomething;
254 cb.openInNewWindow = openInNewWindow;
255 }
256
257 /***
258 * Does the conversion of the HTML
259 * @param HTMLrdr Reader for HTML to be converted
260 * @param BaseUrl URL from which this HTML was taken. We be the base-Url
261 * for all URL-rewritings.
262 * @throws MalformedURLException If the BaseUrl is not a valid URL or if an URL inside
263 * the document could not be converted. Should not happen
264 * normally, even in badly formatted HTML.
265 * @return HTML-String with rewritten URLs and removed (according
266 * to constructor-settings) tags
267 */
268 public synchronized String convertURLs(Reader HTMLrdr, String BaseUrl) throws MalformedURLException
269 {
270 HTMLEditorKit.Parser parse = new HTMLRewriter.ParserGetter().getParser();
271 String res ="";
272 try {
273 if (cb.result != null) {
274 cb.result = null;
275 cb.result = new StringWriter();
276 }
277 cb.baseUrl = new URL(BaseUrl);
278 parse.parse(HTMLrdr,cb,true);
279 res = cb.getResult();
280 } catch (Exception e)
281 {
282 logger.error( "Unable to convertURLS", e );
283 throw new MalformedURLException(e.toString());
284 }
285 return res;
286 }
287
288
289 /*** That Class is needed, because getParser is protected and therefore
290 * only accessibly by a subclass
291 */
292 class ParserGetter extends HTMLEditorKit {
293 /*** This is needed, because getParser is protected
294 * @return Html Parser
295 */
296 public HTMLEditorKit.Parser getParser(){
297 return super.getParser();
298 }
299 }
300
301
302 class Callback extends HTMLEditorKit.ParserCallback {
303
304
305 private URL baseUrl;
306
307
308
309 private boolean inForm = false;
310
311
312
313
314
315 private int ignoreLevel = 0;
316
317 private boolean removeScript = true;
318 private boolean removeStyle = true;
319 private boolean removeNoScript = true;
320 private boolean removeMeta = true;
321 private boolean removeApplet = true;
322 private boolean removeObject = true;
323 private boolean removeHead = true;
324 private boolean openInNewWindow = false;
325
326
327 private boolean removeOnSomething = true;
328
329 private boolean inScript = false;
330 private boolean inStyle = false;
331
332 private StringWriter result = new StringWriter();
333
334 private Callback () {
335 }
336
337
338 private Callback addToResult(Object txt)
339 {
340
341
342 if (ignoreLevel > 0) return this;
343
344 try {
345 result.write(txt.toString());
346 } catch (Exception e) {
347 return this;
348 }
349
350 private Callback addToResult(char[] txt)
351 {
352 if (ignoreLevel > 0) return this;
353
354 try {
355 result.write(txt);
356 } catch (Exception e) {
357 return this;
358 }
359
360 /*** Accessor to the Callback's content-String
361 * @return Cleaned and rewritten HTML-Content
362 */
363 public String getResult() {
364 try {
365 result.flush();
366 } catch (Exception e) {
367
368
369 String res = " " + result.toString();
370
371 return res;
372 }
373
374
375 public void flush() throws javax.swing.text.BadLocationException {
376
377 }
378
379 /***
380 * Because Scripts and Stlyle sometimes are defined in comments, thoese
381 * will be written. Otherwise comments are removed
382 */
383 public void handleComment(char[] values,int param) {
384 if ( !( inStyle || inScript))
385 return;
386
387 try {
388 result.write("<!--");
389 result.write(values);
390 result.write("-->");
391 } catch (Exception e) {
392
393 }
394
395 public void handleEndOfLineString(java.lang.String str) {
396 addToResult("\n");
397 }
398
399 public void handleError(java.lang.String str,int param) {
400
401 }
402
403 public void handleSimpleTag(HTML.Tag tag,MutableAttributeSet attrs,int param) {
404 if (removeMeta && (tag == HTML.Tag.META)) {
405 return;
406 }
407 appendTagToResult(tag,attrs);
408 }
409
410 public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrs, int position) {
411 appendTagToResult(tag,attrs);
412 }
413
414 public void handleEndTag(HTML.Tag tag, int position) {
415 if ((tag ==HTML.Tag.FORM) && (inForm)) {
416
417 addToResult("</").addToResult(tag).addToResult(">");
418 inForm = false;
419 } else if (tag == HTML.Tag.FORM) {
420
421
422 } else {
423 addToResult("</").addToResult(tag).addToResult(">");
424 }
425
426
427 if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
428 inScript = false;
429 } else if ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
430 inStyle = false;
431 }
432
433 if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
434 ignoreLevel --;
435 } else if ( removeStyle && (tag == HTML.Tag.STYLE)) {
436 ignoreLevel --;
437 } else if ( removeHead && (tag == HTML.Tag.HEAD)) {
438 ignoreLevel --;
439 } else if ( removeApplet && (tag == HTML.Tag.APPLET)) {
440 ignoreLevel --;
441 } else if ( removeObject && (tag == HTML.Tag.OBJECT)) {
442 ignoreLevel --;
443 } else if ( removeNoScript && (tag.toString().equalsIgnoreCase("NOSCRIPT"))) {
444 ignoreLevel --;
445 }
446 }
447
448 private void appendTagToResult(HTML.Tag tag, MutableAttributeSet attrs) {
449
450 if (tag.toString().equalsIgnoreCase("__ENDOFLINETAG__")) {
451
452
453 return;
454 }
455
456 if (tag.toString().equalsIgnoreCase("__IMPLIED__")) {
457
458
459 return;
460 }
461
462 convertURLS(tag,attrs);
463 Enumeration e = attrs.getAttributeNames();
464 if (tag == HTML.Tag.BASE)
465 return;
466
467 addToResult("<").addToResult(tag);
468 while (e.hasMoreElements()) {
469 Object attr = e.nextElement();
470 String attrName = attr.toString();
471 String value = attrs.getAttribute(attr).toString();
472
473
474 if (!(removeOnSomething
475 && attrName.toLowerCase().startsWith("on")
476 && (attrName.length() > 2))) {
477
478 addToResult(" ").addToResult(attr).addToResult("=\"")
479 .addToResult(value).addToResult("\"");
480 }
481 }
482 addToResult(">");
483 }
484
485 /*** Here the magic happens.
486 *
487 * If someone wants new types of URLs to be rewritten, add them here
488 * @param tag TAG from the Callback-Interface
489 * @param attrs Attribute-Set from the Callback-Interface
490 */
491
492 private void convertURLS( HTML.Tag tag, MutableAttributeSet attrs ) {
493
494
495
496 if (tag == HTML.Tag.A) {
497 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
498
499 addConvertedAttribute( HTML.Attribute.HREF,
500 attrs );
501 }
502 if ((attrs.getAttribute(HTML.Attribute.TARGET) == null) && cb.openInNewWindow) {
503 attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
504 }
505 } else if (tag == HTML.Tag.AREA) {
506 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
507
508 addConvertedAttribute( HTML.Attribute.HREF,
509 attrs );
510 }
511 if ((attrs.getAttribute(HTML.Attribute.TARGET) == null) && cb.openInNewWindow) {
512 attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
513 }
514 } else if (((tag == HTML.Tag.IMG) || (tag == HTML.Tag.INPUT) || (tag == HTML.Tag.SCRIPT))
515 && (attrs.getAttribute(HTML.Attribute.SRC) != null)) {
516
517 addConvertedAttribute( HTML.Attribute.SRC,
518 attrs );
519 } else if (tag == HTML.Tag.LINK) {
520 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
521
522 addConvertedAttribute( HTML.Attribute.HREF,
523 attrs );
524 }
525 } else if ( tag == HTML.Tag.APPLET ) {
526
527 if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
528 int endOfPath = baseUrl.toString().lastIndexOf("/");
529 attrs.addAttribute(HTML.Attribute.CODEBASE,
530 baseUrl.toString().substring(0,endOfPath +1));
531 } else {
532 addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
533 }
534 } else if (tag == HTML.Tag.OBJECT) {
535
536 if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
537 int endOfPath = baseUrl.toString().lastIndexOf("/");
538 attrs.addAttribute(HTML.Attribute.CODEBASE,
539 baseUrl.toString().substring(0,endOfPath +1));
540 } else {
541 addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
542 }
543 } else if (tag == HTML.Tag.BODY) {
544 if (attrs.getAttribute(HTML.Attribute.BACKGROUND) != null) {
545
546 attrs.removeAttribute( HTML.Attribute.BACKGROUND);
547 }
548 } else if (tag == HTML.Tag.BASE) {
549 if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
550 try {
551 baseUrl = new URL(attrs.getAttribute(HTML.Attribute.HREF).toString());
552 } catch (Throwable t) {
553 logger.error( "HTMLRewriter: Setting BASE="
554 + attrs.getAttribute(HTML.Attribute.HREF).toString()
555 + t.getMessage());
556 }
557 attrs.removeAttribute(HTML.Attribute.HREF);
558 }
559 } else if (tag == HTML.Tag.FORM) {
560
561 inForm = true;
562 if (attrs.getAttribute(HTML.Attribute.ACTION) == null) {
563
564 attrs.addAttribute(HTML.Attribute.ACTION,
565 baseUrl.toString());
566 } else {
567 addConvertedAttribute( HTML.Attribute.ACTION,
568 attrs );
569 }
570 } else if (tag == HTML.Tag.TD) {
571
572 if (! (attrs.getAttribute(HTML.Attribute.BACKGROUND) == null)) {
573 addConvertedAttribute( HTML.Attribute.BACKGROUND,
574 attrs );
575 }
576 }
577
578
579
580
581
582
583
584
585
586 if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
587 inScript = true;
588 } else if ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
589 inStyle = true;
590 }
591
592 if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
593 ignoreLevel ++;
594 } else if ( removeStyle && (tag == HTML.Tag.STYLE)) {
595 ignoreLevel ++;
596 } else if ( removeHead && (tag == HTML.Tag.HEAD)) {
597 ignoreLevel ++;
598 } else if ( removeApplet && (tag == HTML.Tag.APPLET)) {
599 ignoreLevel ++;
600 } else if ( removeObject && (tag == HTML.Tag.OBJECT)) {
601 ignoreLevel ++;
602 } else if (removeNoScript && (tag.toString().equalsIgnoreCase("NOSCRIPT"))) {
603 ignoreLevel ++;
604 }
605 }
606
607 /***
608 *
609 * Converts the given attribute to base URL, if not null
610 *
611 */
612 private void addConvertedAttribute( HTML.Attribute attr,
613 MutableAttributeSet attrs ) {
614 if( attrs.getAttribute( attr ) != null ) {
615 String attrSource = attrs.getAttribute( attr ).toString();
616 attrs.addAttribute( attr,
617 generateNewUrl( attrSource ) );
618 }
619 }
620
621
622 private String generateNewUrl(String oldURL) {
623 try {
624 URL x = new URL(baseUrl,oldURL);
625 return x.toString();
626 } catch (Throwable t) {
627 if (oldURL.toLowerCase().startsWith("javascript:")) {
628 return oldURL;
629 }
630 logger.error( "HTMLRewriter: Setting BASE="
631 + baseUrl
632 + " Old = "
633 + oldURL
634 + t.getMessage());
635 return oldURL;
636 }
637 }
638
639 public void handleText(char[] values,int param) {
640 addToResult(values);
641 }
642 }
643 }