1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.jetspeed.services.urlmanager;
18
19
20 import java.io.BufferedInputStream;
21 import java.io.BufferedReader;
22 import java.io.File;
23 import java.io.InputStreamReader;
24 import java.io.IOException;
25 import java.io.Reader;
26 import java.io.UnsupportedEncodingException;
27 import java.net.HttpURLConnection;
28 import java.net.MalformedURLException;
29 import java.net.URL;
30 import java.net.URLConnection;
31 import java.util.Hashtable;
32 import java.util.Vector;
33
34
35 import org.apache.jetspeed.services.resources.JetspeedResources;
36
37
38 import org.apache.jetspeed.cache.disk.DiskCacheEntry;
39 import org.apache.jetspeed.cache.disk.DiskCacheUtils;
40 import org.apache.jetspeed.cache.disk.JetspeedDiskCache;
41 import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
42 import org.apache.jetspeed.services.logging.JetspeedLogger;
43
44 /***
45 <p>
46 Handles fetching URLs and if for some reason anything happens add it to the
47 BadURLManager. There are also some util methods for downloading URLs that don't
48 use the Disk Cache.
49 </p>
50
51
52
53 @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
54 @author <a href="mailto:sgala@hisitech.com">Santiago Gala</a>
55 @version $Id: URLFetcher.java,v 1.14 2004/02/23 03:30:47 jford Exp $
56 */
57 public class URLFetcher
58 {
59 /***
60 * Static initialization of the logger for this class
61 */
62 private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLFetcher.class.getName());
63
64 /***
65 URLs that Jetspeed is currently trying to fetch in real time.
66 */
67 private static Hashtable realtime_urls = new Hashtable();
68
69 /***
70 *
71 */
72 static final boolean shouldFetchNow =
73 JetspeedResources.getBoolean( JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
74
75 static {
76
77 java.net.HttpURLConnection.setFollowRedirects(true);
78 }
79
80 public static final Reader fetch( String url ) throws IOException {
81 return fetch ( url, false );
82 }
83
84
85 /***
86 Try and fetch a URL as and get the content as a String and possibly add
87 the URL to the BadURLManager if anything goes wrong.
88
89 @param url The URL to fetch
90 @param force if set to true then do not use force this entry to be in the cache...
91 IE do not use CACHE_REQUIRE_CACHED
92 */
93 public static final Reader fetch( String url,
94 boolean force ) throws IOException {
95
96 if ( ! URLManager.isOK( url ) ) {
97 throw new URLNotAvailableException( url );
98 }
99
100
101 if( force == false && DiskCacheUtils.isCached( url ) == true)
102 {
103 logger.info( "The url " +
104 url +
105 " is fetched from the Cache" );
106 return JetspeedDiskCache.getInstance().getEntry( url ).getReader();
107 }
108
109
110 if ( shouldFetchNow &&
111 DiskCacheUtils.isCached( url ) == false &&
112 isRealtimeURL( url ) == false &&
113 force == false ) {
114
115 logger.info( "The url " +
116 url +
117 " is not in the cache and will be fetched now because you have configured -> " +
118 JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
119
120
121
122 JetspeedDiskCache.getInstance().refresh( url );
123
124
125 throw new ContentNotAvailableException( url );
126 }
127
128 if( isRealtimeURL( url ) == true ) {
129 addRealtimeURL( url );
130 synchronized(url.intern())
131 {
132 try
133 {
134
135 url.intern().wait();
136 } catch (InterruptedException e)
137 {
138 logger.info("Wait Interrupted");
139 } finally
140 {
141 removeRealtimeURL( url );
142 }
143 }
144
145 return URLFetcher.fetch( url, force );
146 } else {
147 addRealtimeURL( url );
148 }
149 try {
150
151 URL content;
152
153
154 String protocol = url.substring(0, url.indexOf(":/"));
155
156
157 String proxyHost = URLManager.getProxyHost( protocol );
158 if (proxyHost != null)
159 {
160
161 content = new URL(protocol,
162 proxyHost,
163 URLManager.getProxyPort( protocol ),
164 url);
165 }
166 else
167 {
168 content = new URL( url );
169 }
170
171 URLConnection conn = content.openConnection();
172 return getReader( conn );
173
174 } catch ( Throwable t ) {
175
176 String reason = "";
177
178 if ( t instanceof MalformedURLException ) {
179 reason = "The URL is Malformed.";
180 } else {
181 reason = t.toString();
182 }
183
184
185
186 if ( DiskCacheUtils.isCached( url ) == false ) {
187
188
189 URLManager.register( url, URLManagerService.STATUS_BAD, reason );
190 } else {
191
192
193 JetspeedDiskCache.getInstance().remove(url);
194 }
195
196
197 throw new URLNotAvailableException( reason, url );
198
199 } finally {
200 removeRealtimeURL( url );
201 }
202
203 }
204
205
206 /***
207 Try and fetch a URL if the copy in the cache has expired and add
208 the URL to the BadURLManager if anything goes wrong.
209
210 @param url The URL to fetch
211 @param force if set to true then do not use force this entry to be in the cache...
212 IE do not use CACHE_REQUIRE_CACHED
213 */
214 public static final boolean refresh( String url) throws IOException {
215
216 if ( ! URLManager.isOK( url ) ) {
217 if( DiskCacheUtils.isCached(url) )
218 JetspeedDiskCache.getInstance().remove(url);
219 throw new URLNotAvailableException( url );
220 }
221
222 if(isRealtimeURL(url)) {
223 return false;
224 }
225
226
227 DiskCacheEntry dce = null;
228 if( DiskCacheUtils.isCached(url) ) {
229 try {
230 dce = JetspeedDiskCache.getInstance().getEntry( url );
231 if(!dce.hasExpired())
232 {
233 return false;
234 }
235 addRealtimeURL( url );
236
237
238
239 URL sock;
240
241
242 String protocol = url.substring(0, url.indexOf(":/"));
243
244
245 String proxyHost = URLManager.getProxyHost( protocol );
246 if (proxyHost != null)
247 {
248
249 sock = new URL(protocol,
250 proxyHost,
251 URLManager.getProxyPort( protocol ),
252 url);
253 }
254 else
255 {
256 sock = new URL( url );
257 }
258
259 URLConnection conn = null;
260 conn = sock.openConnection();
261
262 File file = dce.getFile();
263 long mod = dce.getLastModified();
264 long filesize = 0;
265 if(file != null)
266 {
267 filesize = file.length();
268 }
269
270 if(mod > 0 || filesize > 0)
271 conn.setIfModifiedSince(mod);
272
273 conn.connect();
274 long last = conn.getLastModified();
275 long expires = conn.getExpiration();
276 int clength = conn.getContentLength();
277 int respCode = 200;
278 if(conn instanceof HttpURLConnection) {
279 respCode = ( ( HttpURLConnection )conn ).getResponseCode();
280 }
281
282 if (respCode != 304
283 (clength == -1 || clength > 0) &&
284 ( last == 0 ||
285 last > dce.getLastModified()) ) {
286
287 logger.info( "URLFetcher: Found updated URL: " +
288 url +
289 " Modified " + last + " Expires: " + expires +
290 " CLength: " + clength );
291
292
293
294 JetspeedDiskCache.getInstance().getEntry( url, getReader( conn ) );
295
296 conn.getInputStream().close();
297
298
299
300 if(last > 0)
301 dce.setLastModified(last);
302 else
303 dce.setLastModified( System.currentTimeMillis() );
304 dce.setExpirationTime(expires);
305
306
307
308 return true;
309
310
311 } else {
312
313 if(last > 0)
314 dce.setLastModified(last);
315 else
316 dce.setLastModified( System.currentTimeMillis() );
317 dce.setExpirationTime(expires);
318
319
320 logger.info( "DiskCacheDaemon: URL still valid: " + url +
321 " Modified " + last + " Expires: " + expires +
322 " CLength: " + clength);
323
324 return false;
325 }
326 } catch (Throwable e) {
327
328 logger.error("Throwable", e);
329 URLManager.register( url,
330 URLManagerService.STATUS_BAD,
331 e.toString() );
332 } finally {
333 removeRealtimeURL( url );
334 }
335
336 } else {
337 logger.info( "URLFetcher: Cache miss during validation! Forcing url: " + url );
338 removeRealtimeURL( url );
339 JetspeedDiskCache.getInstance().getEntry( url, true );
340 return true;
341 }
342 return false;
343
344 }
345
346
347 /***
348 *
349 * Return a Reader for a given HTTP connection.
350 * If the connection first line contains a XML declaration
351 * with encoding, honor this encoding.
352 * If not, use the encoding from the HTTP connection,
353 * taking ISO-8859-1 as default.
354 *
355 */
356 static final Reader getReader( URLConnection conn )
357 throws IOException, UnsupportedEncodingException {
358 String enc = conn.getContentEncoding();
359 if( enc == null ) {
360 enc = "ISO-8859-1";
361 }
362
363
364
365
366 BufferedInputStream is = new BufferedInputStream( conn.getInputStream() );
367
368
369
370
371
372 is.mark( 20480 );
373 BufferedReader asciiReader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
374 String decl = asciiReader.readLine();
375
376 String key = "encoding=\"";
377
378 if( decl != null ) {
379 int off = decl.indexOf( key );
380 if( off > 0 ) {
381 enc = decl.substring( off + key.length(),
382 decl.indexOf( '"' , off + key.length()) );
383 }
384 }
385 logger.info("URLFetcher: found URL with encoding -> " + enc );
386
387 is.reset();
388 Reader rdr = new InputStreamReader( is,
389 enc );
390 return rdr;
391 }
392
393
394
395 /***
396 Add a URL that is downloading in realtime
397 */
398 static final void addRealtimeURL( String url ) {
399 synchronized( realtime_urls )
400 {
401 Vector threads = (Vector) realtime_urls.get( url);
402 if(threads != null)
403 {
404 if(!threads.contains(Thread.currentThread()))
405 {
406 threads.addElement(Thread.currentThread() );
407 }
408 } else {
409 threads = new Vector();
410 threads.addElement(Thread.currentThread());
411 realtime_urls.put( url, threads );
412 }
413 }
414
415 }
416
417 /***
418 Remove a URL because it isn't downloading anymore.
419 */
420 static final void removeRealtimeURL( String url ) {
421 synchronized( realtime_urls )
422 {
423 Vector threads = (Vector) realtime_urls.get( url);
424 if(threads != null)
425 synchronized( threads )
426 {
427 Thread realLoader = (Thread) threads.firstElement();
428 if(realLoader == Thread.currentThread())
429 {
430 synchronized(url.intern())
431 {
432 realtime_urls.remove(url);
433 url.intern().notifyAll();
434 }
435 } else {
436 threads.removeElement(Thread.currentThread());
437 }
438 }
439 }
440
441 }
442
443 /***
444 Return true if this URL isn't downloading in realtime.
445 */
446 static final boolean isRealtimeURL( String url ) {
447
448 synchronized( realtime_urls ) {
449 return realtime_urls.get( url ) != null;
450 }
451
452 }
453
454 /***
455 Return the list of realtime URLs for debug
456 */
457 public static final Hashtable getRealtimeURLs() {
458 synchronized(realtime_urls) {
459 return realtime_urls;
460 }
461 }
462
463 }