1/*2 * Copyright 2000-2004 The Apache Software Foundation.3 * 4 * Licensed under the Apache License, Version 2.0 (the "License");5 * you may not use this file except in compliance with the License.6 * You may obtain a copy of the License at7 * 8 * http://www.apache.org/licenses/LICENSE-2.09 * 10 * Unless required by applicable law or agreed to in writing, software11 * distributed under the License is distributed on an "AS IS" BASIS,12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.13 * See the License for the specific language governing permissions and14 * limitations under the License.15 */1617packageorg.apache.jetspeed.services.urlmanager;
1819//standard Java stuff20import java.io.BufferedInputStream;
21import java.io.BufferedReader;
22import java.io.File;
23import java.io.InputStreamReader;
24import java.io.IOException;
25import java.io.Reader;
26import java.io.UnsupportedEncodingException;
27import java.net.HttpURLConnection;
28import java.net.MalformedURLException;
29import java.net.URL;
30import java.net.URLConnection;
31import java.util.Hashtable;
32import java.util.Vector;
3334//turbine stuff35import org.apache.jetspeed.services.resources.JetspeedResources;
3637//jetspeed stuff38import org.apache.jetspeed.cache.disk.DiskCacheEntry;
39import org.apache.jetspeed.cache.disk.DiskCacheUtils;
40import org.apache.jetspeed.cache.disk.JetspeedDiskCache;
41import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
42import org.apache.jetspeed.services.logging.JetspeedLogger;
4344/***45<p>46Handles fetching URLs and if for some reason anything happens add it to the47BadURLManager. There are also some util methods for downloading URLs that don't48use the Disk Cache.49</p>50515253@author <a href="mailto:burton@apache.org">Kevin A. Burton</a>54@author <a href="mailto:sgala@hisitech.com">Santiago Gala</a>55@version $Id: URLFetcher.java,v 1.14 2004/02/23 03:30:47 jford Exp $56*/57publicclassURLFetcher58 {
59/***60 * Static initialization of the logger for this class61 */62privatestaticfinalJetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLFetcher.class.getName());
6364/***65 URLs that Jetspeed is currently trying to fetch in real time.66 */67privatestatic Hashtable realtime_urls = new Hashtable();
6869/***70 *71 */72staticfinalboolean shouldFetchNow =
73 JetspeedResources.getBoolean( JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
7475static {
76//Looking for redirected channels...77 java.net.HttpURLConnection.setFollowRedirects(true);
78 }
7980publicstaticfinal Reader fetch( String url ) throws IOException {
81return fetch ( url, false );
82 }
838485/***86 Try and fetch a URL as and get the content as a String and possibly add87 the URL to the BadURLManager if anything goes wrong.8889 @param url The URL to fetch90 @param force if set to true then do not use force this entry to be in the cache...91 IE do not use CACHE_REQUIRE_CACHED92 */93publicstaticfinal Reader fetch( String url,
94boolean force ) throws IOException {
9596if ( ! URLManager.isOK( url ) ) {
97thrownewURLNotAvailableException( url );
98 }
99100//SGP101if( force == false && DiskCacheUtils.isCached( url ) == true)
102 {
103 logger.info( "The url " +
104 url +
105" is fetched from the Cache" );
106return JetspeedDiskCache.getInstance().getEntry( url ).getReader();
107 }
108109//do cache required checking110if ( shouldFetchNow &&
111 DiskCacheUtils.isCached( url ) == false &&
112 isRealtimeURL( url ) == false &&
113 force == false ) {
114115 logger.info( "The url " +
116 url +
117" is not in the cache and will be fetched now because you have configured -> " +
118 JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
119120//it is possible that two thread request the same URL.121//The refresh call in JetspeedDiskCache takes care of this.122 JetspeedDiskCache.getInstance().refresh( url );
123124//thow an Exception that this isn't in the cache.125thrownewContentNotAvailableException( url );
126 }
127128if( isRealtimeURL( url ) == true ) {
129 addRealtimeURL( url );
130synchronized(url.intern())
131 {
132try133 {
134//We wait for other thread to load135 url.intern().wait();
136 } catch (InterruptedException e)
137 {
138 logger.info("Wait Interrupted");
139 } finally140 {
141 removeRealtimeURL( url );
142 }
143 }
144// We try again145return URLFetcher.fetch( url, force );
146 } else {
147 addRealtimeURL( url );
148 }
149try {
150151 URL content;
152153// Determine the URL's protocol154 String protocol = url.substring(0, url.indexOf(":/"));
155156// Check if a proxy is set. If no port is set, use the default port (-1)157 String proxyHost = URLManager.getProxyHost( protocol );
158if (proxyHost != null)
159 {
160// Open the URL using a proxy161 content = new URL(protocol,
162 proxyHost,
163 URLManager.getProxyPort( protocol ),
164 url);
165 }
166else167 {
168 content = new URL( url );
169 }
170171 URLConnection conn = content.openConnection();
172return getReader( conn );
173174 } catch ( Throwable t ) {
175176 String reason = "";
177178if ( t instanceof MalformedURLException ) {
179 reason = "The URL is Malformed.";
180 } else {
181 reason = t.toString();
182 }
183184//if the URL couldn't be fetched because it is remote AND185//it is not in the cache, add it to the bad URL list.186if ( DiskCacheUtils.isCached( url ) == false ) {
187//Reported up there...188//logger.error( t );189 URLManager.register( url, URLManagerService.STATUS_BAD, reason );
190 } else {
191//it is in the cache, remove it (could be broken in cache).192//next time we could be luckier.193 JetspeedDiskCache.getInstance().remove(url);
194 }
195196197thrownewURLNotAvailableException( reason, url );
198199 } finally {
200 removeRealtimeURL( url );
201 }
202203 }
204205206/***207 Try and fetch a URL if the copy in the cache has expired and add208 the URL to the BadURLManager if anything goes wrong.209210 @param url The URL to fetch211 @param force if set to true then do not use force this entry to be in the cache...212 IE do not use CACHE_REQUIRE_CACHED213 */214publicstaticfinalboolean refresh( String url) throws IOException {
215216if ( ! URLManager.isOK( url ) ) {
217if( DiskCacheUtils.isCached(url) )
218 JetspeedDiskCache.getInstance().remove(url);
219thrownewURLNotAvailableException( url );
220 }
221222if(isRealtimeURL(url)) {
223return false;
224 }
225226227DiskCacheEntry dce = null;
228if( DiskCacheUtils.isCached(url) ) {
229try {
230 dce = JetspeedDiskCache.getInstance().getEntry( url );
231if(!dce.hasExpired())
232 {
233return false;
234 }
235 addRealtimeURL( url );
236237//only update this if the URL on which it is based is newer 238//than the one on disk.239 URL sock;
240241// Determine the URL's protocol242 String protocol = url.substring(0, url.indexOf(":/"));
243244// Check if a proxy is set. If no port is set, use the default port (-1)245 String proxyHost = URLManager.getProxyHost( protocol );
246if (proxyHost != null)
247 {
248// Open the URL using a proxy249 sock = new URL(protocol,
250 proxyHost,
251 URLManager.getProxyPort( protocol ),
252 url);
253 }
254else255 {
256 sock = new URL( url );
257 }
258259 URLConnection conn = null;
260 conn = sock.openConnection();
261262 File file = dce.getFile();
263long mod = dce.getLastModified();
264long filesize = 0;
265if(file != null)
266 {
267 filesize = file.length();
268 }
269270if(mod > 0 || filesize > 0)
271 conn.setIfModifiedSince(mod);
272273 conn.connect();
274long last = conn.getLastModified();
275long expires = conn.getExpiration();
276int clength = conn.getContentLength();
277int respCode = 200;
278if(conn instanceof HttpURLConnection) {
279 respCode = ( ( HttpURLConnection )conn ).getResponseCode();
280 }
281282if (respCode != 304 /*NOT MODIFIED*/ && 283 (clength == -1 || clength > 0) &&
284 ( last == 0 ||
285 last > dce.getLastModified()) ) {
286287 logger.info( "URLFetcher: Found updated URL: " +
288 url +
289" Modified " + last + " Expires: " + expires +
290" CLength: " + clength );
291292//force this URL to update.293294 JetspeedDiskCache.getInstance().getEntry( url, getReader( conn ) );
295//Trying to deal with a problem under FreeBSD296 conn.getInputStream().close();
297298//Set the last modified and expiration times for entry299//FIXME: 0 is used in FileWatcher to mean not initialized...300if(last > 0)
301 dce.setLastModified(last);
302else303 dce.setLastModified( System.currentTimeMillis() );
304 dce.setExpirationTime(expires);
305306307//removeRealtimeURL( url ); (done in finally)308returntrue;
309//now make sure that the entry that depends on this HREF310//is updated in the PortletFactory.311 } else {
312313if(last > 0)
314 dce.setLastModified(last);
315else316 dce.setLastModified( System.currentTimeMillis() );
317 dce.setExpirationTime(expires);
318319320 logger.info( "DiskCacheDaemon: URL still valid: " + url +
321" Modified " + last + " Expires: " + expires +
322" CLength: " + clength);
323//removeRealtimeURL( url ); (done in finally)324return false;
325 }
326 } catch (Throwable e) {
327//Add as a Bad URL328 logger.error("Throwable", e);
329 URLManager.register( url,
330 URLManagerService.STATUS_BAD,
331 e.toString() );
332 } finally {
333 removeRealtimeURL( url );
334 }
335336 } else {
337 logger.info( "URLFetcher: Cache miss during validation! Forcing url: " + url );
338 removeRealtimeURL( url );
339 JetspeedDiskCache.getInstance().getEntry( url, true );
340returntrue;
341 }
342return false;
343344 }
345346347/***348 *349 * Return a Reader for a given HTTP connection.350 * If the connection first line contains a XML declaration351 * with encoding, honor this encoding.352 * If not, use the encoding from the HTTP connection,353 * taking ISO-8859-1 as default.354 *355 */356staticfinal Reader getReader( URLConnection conn )
357 throws IOException, UnsupportedEncodingException {
358 String enc = conn.getContentEncoding();
359if( enc == null ) {
360 enc = "ISO-8859-1";
361 }
362// Some XML files come with a encoding attribute inside,363// different than the HTTP encoding. We will have364// to start reading the Reader, read the attribute and rewind 365// the stream, generating a new reader with the "true" encoding366 BufferedInputStream is = new BufferedInputStream( conn.getInputStream() );
367//If document is XML, find the encoding and give it priority over368//the one returned by the connection369370//we mark for resetting later. We need a big number to ensure371// stack of streams don't read it to fill buffers.372 is.mark( 20480 );
373 BufferedReader asciiReader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
374 String decl = asciiReader.readLine();
375//System.err.println( "Line: " + decl );376 String key = "encoding=\"";
377//decl nul means that the connection got reset...378if( decl != null ) {
379int off = decl.indexOf( key );
380if( off > 0 ) {
381 enc = decl.substring( off + key.length(),
382 decl.indexOf( '"' , off + key.length()) );
383 }
384 }
385 logger.info("URLFetcher: found URL with encoding -> " + enc );
386//Reset the bytes read387 is.reset();
388 Reader rdr = new InputStreamReader( is,
389 enc );
390return rdr;
391 }
392393394395/***396 Add a URL that is downloading in realtime397 */398staticfinalvoid addRealtimeURL( String url ) {
399synchronized( realtime_urls )
400 {
401 Vector threads = (Vector) realtime_urls.get( url);
402if(threads != null)
403 {
404if(!threads.contains(Thread.currentThread()))
405 {
406 threads.addElement(Thread.currentThread() );
407 }
408 } else {
409 threads = new Vector();
410 threads.addElement(Thread.currentThread());
411 realtime_urls.put( url, threads );
412 }
413 }
414415 }
416417/***418 Remove a URL because it isn't downloading anymore.419 */420staticfinalvoid removeRealtimeURL( String url ) {
421synchronized( realtime_urls )
422 {
423 Vector threads = (Vector) realtime_urls.get( url);
424if(threads != null)
425synchronized( threads )
426 {
427 Thread realLoader = (Thread) threads.firstElement();
428if(realLoader == Thread.currentThread())
429 {
430synchronized(url.intern())
431 {
432 realtime_urls.remove(url);
433 url.intern().notifyAll();
434 }
435 } else {
436 threads.removeElement(Thread.currentThread());
437 }
438 }
439 }
440441 }
442443/***444 Return true if this URL isn't downloading in realtime.445 */446staticfinalboolean isRealtimeURL( String url ) {
447448synchronized( realtime_urls ) {
449return realtime_urls.get( url ) != null;
450 }
451452 }
453454/***455 Return the list of realtime URLs for debug456 */457publicstaticfinal Hashtable getRealtimeURLs() {
458synchronized(realtime_urls) {
459return realtime_urls;
460 }
461 }
462463 }