View Javadoc

1   /*
2    * Copyright 2000-2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  
17  package org.apache.jetspeed.services.urlmanager;
18  
19  //standard Java stuff
20  import java.io.BufferedInputStream;
21  import java.io.BufferedReader;
22  import java.io.File;
23  import java.io.InputStreamReader;
24  import java.io.IOException;
25  import java.io.Reader;
26  import java.io.UnsupportedEncodingException;
27  import java.net.HttpURLConnection;
28  import java.net.MalformedURLException;
29  import java.net.URL;
30  import java.net.URLConnection;
31  import java.util.Hashtable;
32  import java.util.Vector;
33  
34  //turbine stuff
35  import org.apache.jetspeed.services.resources.JetspeedResources;
36  
37  //jetspeed stuff
38  import org.apache.jetspeed.cache.disk.DiskCacheEntry;
39  import org.apache.jetspeed.cache.disk.DiskCacheUtils;
40  import org.apache.jetspeed.cache.disk.JetspeedDiskCache;
41  import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
42  import org.apache.jetspeed.services.logging.JetspeedLogger;
43  
44  /***
45  <p>
46  Handles fetching URLs and if for some reason anything happens add it to the
47  BadURLManager.  There are also some util methods for downloading URLs that don't
48  use the Disk Cache.
49  </p>
50  
51  
52  
53  @author <a href="mailto:burton@apache.org">Kevin A. Burton</a>
54  @author <a href="mailto:sgala@hisitech.com">Santiago Gala</a>
55  @version $Id: URLFetcher.java,v 1.14 2004/02/23 03:30:47 jford Exp $
56  */
57  public class URLFetcher 
58  {
59      /***
60       * Static initialization of the logger for this class
61       */    
62      private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLFetcher.class.getName());
63      
64      /***
65      URLs that Jetspeed is currently trying to fetch in real time.
66      */
67      private static Hashtable realtime_urls = new Hashtable();
68  
69      /***
70       *
71       */
72      static final boolean shouldFetchNow = 
73          JetspeedResources.getBoolean( JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
74      
75      static {
76          //Looking for redirected channels...
77          java.net.HttpURLConnection.setFollowRedirects(true);
78      }
79  
80      public static final Reader fetch( String url ) throws IOException {
81          return fetch ( url, false );
82      }
83  
84      
85      /***
86      Try and fetch a URL as and get the content as a String and possibly add
87      the URL to the BadURLManager if anything goes wrong.
88      
89      @param url The URL to fetch
90      @param force if set to true then do not use force this entry to be in the cache...
91                   IE do not use CACHE_REQUIRE_CACHED
92      */
93      public static final Reader fetch( String url,
94                                             boolean force ) throws IOException {
95  
96          if ( ! URLManager.isOK( url ) ) {
97              throw new URLNotAvailableException( url );
98          }
99  
100         //SGP
101         if( force == false && DiskCacheUtils.isCached( url ) == true) 
102         {
103             logger.info( "The url " + 
104                       url + 
105                       " is fetched from the Cache" );
106             return JetspeedDiskCache.getInstance().getEntry( url ).getReader();
107         }
108         
109         //do cache required checking
110         if ( shouldFetchNow && 
111              DiskCacheUtils.isCached( url ) == false && 
112              isRealtimeURL( url ) == false &&
113              force == false ) {
114 
115             logger.info( "The url " + 
116                       url + 
117                       " is not in the cache and will be fetched now because you have configured -> " + 
118                       JetspeedResources.CACHE_REQUIRE_CACHED_KEY );
119                  
120             //it is possible that two thread request the same URL.
121             //The refresh call in JetspeedDiskCache takes care of this.
122             JetspeedDiskCache.getInstance().refresh( url );
123             
124             //thow an Exception that this isn't in the cache.
125             throw new ContentNotAvailableException( url );
126         }
127 
128         if( isRealtimeURL( url ) == true ) {
129             addRealtimeURL( url );
130           synchronized(url.intern())
131           {
132              try
133              {
134                //We wait for other thread to load
135                 url.intern().wait();
136              } catch (InterruptedException e)
137              {
138                logger.info("Wait Interrupted");
139              } finally
140              {
141                   removeRealtimeURL( url );
142                }
143           }
144             // We try again
145           return URLFetcher.fetch( url, force );
146         } else {
147             addRealtimeURL( url );
148         }
149         try {
150             
151             URL content;
152 
153 	    // Determine the URL's protocol
154             String protocol = url.substring(0, url.indexOf(":/"));
155 	    
156 	    // Check if a proxy is set. If no port is set, use the default port (-1)
157             String proxyHost = URLManager.getProxyHost( protocol );
158             if (proxyHost != null)
159             {
160                 // Open the URL using a proxy
161                 content = new URL(protocol,
162                                   proxyHost,
163                                   URLManager.getProxyPort( protocol ),
164                                   url);
165             }
166             else
167             {
168                 content = new URL( url );
169             }
170 
171             URLConnection conn = content.openConnection();
172             return getReader( conn );
173             
174         } catch ( Throwable t ) {
175             
176             String reason = "";
177             
178             if ( t instanceof MalformedURLException ) {
179                 reason = "The URL is Malformed.";
180             } else {
181                 reason = t.toString();
182             }
183             
184             //if the URL couldn't be fetched because it is remote AND
185             //it is not in the cache, add it to the bad URL list.
186             if ( DiskCacheUtils.isCached( url ) == false ) {
187                 //Reported up there...
188                 //logger.error( t );
189                 URLManager.register( url, URLManagerService.STATUS_BAD, reason );
190             } else {
191             //it is in the cache, remove it (could be broken in cache).
192             //next time we could be luckier.
193                 JetspeedDiskCache.getInstance().remove(url);
194             }
195 
196 
197             throw new URLNotAvailableException( reason, url );
198 
199         } finally {
200             removeRealtimeURL( url );
201         }
202 
203     }
204 
205 
206     /***
207     Try and fetch a URL if the copy in the cache has expired and add
208     the URL to the BadURLManager if anything goes wrong.
209     
210     @param url The URL to fetch
211     @param force if set to true then do not use force this entry to be in the cache...
212                  IE do not use CACHE_REQUIRE_CACHED
213     */
214     public static final boolean refresh( String url) throws IOException {
215         
216         if ( ! URLManager.isOK( url ) ) {
217             if( DiskCacheUtils.isCached(url) ) 
218                 JetspeedDiskCache.getInstance().remove(url);
219             throw new URLNotAvailableException( url );
220         }
221         
222         if(isRealtimeURL(url)) {
223             return false;
224         }
225 
226             
227          DiskCacheEntry dce = null;
228          if( DiskCacheUtils.isCached(url) ) {
229              try {
230                  dce = JetspeedDiskCache.getInstance().getEntry( url );
231                  if(!dce.hasExpired())
232                  {
233                          return false;
234                  }
235                  addRealtimeURL( url );
236 
237                  //only update this if the URL on which it is based is newer 
238                  //than the one on disk.
239                  URL sock;
240                    
241                  // Determine the URL's protocol
242                  String protocol = url.substring(0, url.indexOf(":/"));
243            
244                  // Check if a proxy is set. If no port is set, use the default port (-1)
245                  String proxyHost = URLManager.getProxyHost( protocol );
246                  if (proxyHost != null)
247                  {
248                      // Open the URL using a proxy
249                      sock = new URL(protocol,
250                                     proxyHost,
251                                     URLManager.getProxyPort( protocol ),
252                                     url);
253                  }
254                  else
255                  {	
256                      sock = new URL( url );
257                  }
258 
259                  URLConnection conn = null;
260                  conn = sock.openConnection();
261 
262                  File file = dce.getFile();
263                  long mod = dce.getLastModified();
264                  long filesize = 0;
265                  if(file != null)
266                  {
267                      filesize = file.length();
268                  }
269 
270                  if(mod > 0 || filesize > 0)
271                      conn.setIfModifiedSince(mod);
272                    
273                  conn.connect();
274                  long last = conn.getLastModified();
275                  long expires = conn.getExpiration();
276                  int clength = conn.getContentLength();
277                  int respCode = 200;
278                  if(conn instanceof HttpURLConnection) {
279                      respCode = ( ( HttpURLConnection )conn ).getResponseCode();
280                  }
281                    
282                  if (respCode != 304 /*NOT MODIFIED*/ && 
283                      (clength == -1 || clength > 0) && 
284                      (  last == 0 || 
285                        last > dce.getLastModified()) ) {
286 
287                      logger.info( "URLFetcher: Found updated URL: " + 
288                                url +
289                                " Modified " + last + " Expires: " + expires +
290                                " CLength: " + clength ); 
291                 
292                      //force this URL to update.
293 
294                      JetspeedDiskCache.getInstance().getEntry( url, getReader( conn ) );
295                      //Trying to deal with a problem under FreeBSD
296                      conn.getInputStream().close();
297 
298                      //Set the last modified and expiration times for entry
299                      //FIXME: 0 is used in FileWatcher to mean not initialized...
300                      if(last > 0)
301                          dce.setLastModified(last);    
302                      else
303                          dce.setLastModified( System.currentTimeMillis() );
304                      dce.setExpirationTime(expires);
305 
306 
307                      //removeRealtimeURL( url ); (done in finally)
308                      return true;
309                      //now make sure that the entry that depends on this HREF
310                      //is updated in the PortletFactory.
311                  } else {
312 
313                      if(last > 0)
314                          dce.setLastModified(last);    
315                      else
316                          dce.setLastModified( System.currentTimeMillis() );
317                      dce.setExpirationTime(expires);
318                            
319                        
320                      logger.info( "DiskCacheDaemon: URL still valid: " + url +
321                                " Modified " + last + " Expires: " + expires +
322                                " CLength: " + clength); 
323                      //removeRealtimeURL( url ); (done in finally)
324                      return false;
325                  }
326              } catch (Throwable e) {
327                  //Add as a Bad URL
328                  logger.error("Throwable",  e);
329                  URLManager.register( url,
330                                       URLManagerService.STATUS_BAD,
331                                       e.toString() );
332              } finally {
333                  removeRealtimeURL( url );
334              }
335                    
336          } else {
337              logger.info( "URLFetcher: Cache miss during validation! Forcing url: " + url ); 
338              removeRealtimeURL( url );
339              JetspeedDiskCache.getInstance().getEntry( url, true );
340              return true;
341          }
342            return false;
343                     
344     }
345 
346 
347     /***
348      *
349      * Return a Reader for a given HTTP connection.
350      * If the connection first line contains a XML declaration
351      * with encoding, honor this encoding.
352      * If not, use the encoding from the HTTP connection,
353      * taking ISO-8859-1 as default.
354      *
355     */
356     static final Reader getReader( URLConnection conn )
357         throws IOException, UnsupportedEncodingException {
358         String enc = conn.getContentEncoding();
359         if( enc == null ) {
360             enc = "ISO-8859-1";
361         }
362         // Some XML files come with a encoding attribute inside,
363         // different than the HTTP encoding. We will have
364         // to start reading the Reader, read the attribute and rewind 
365         // the stream, generating a new reader with the "true" encoding
366         BufferedInputStream is = new BufferedInputStream( conn.getInputStream() );
367         //If document is XML, find the encoding and give it priority over
368         //the one returned by the connection
369 
370         //we mark for resetting later. We need a big number to ensure
371         // stack of streams don't read it to fill buffers.
372         is.mark( 20480 );
373         BufferedReader asciiReader = new BufferedReader( new InputStreamReader( is, "ASCII" ) );
374         String decl = asciiReader.readLine();
375         //System.err.println( "Line: " + decl );
376         String key = "encoding=\"";
377         //decl nul means that the connection got reset...
378         if( decl != null ) {
379             int off = decl.indexOf( key );
380             if( off > 0 ) {
381                 enc = decl.substring( off + key.length(), 
382                                       decl.indexOf( '"' , off + key.length()) );
383             }
384         }
385         logger.info("URLFetcher: found URL with encoding -> " + enc );
386         //Reset the bytes read
387         is.reset();
388         Reader rdr = new InputStreamReader( is,
389                                             enc );
390         return rdr;
391     }
392 
393 
394     
395     /***
396     Add a URL that is downloading in realtime
397     */
398     static final void addRealtimeURL( String url ) {
399         synchronized( realtime_urls )
400         {
401             Vector threads = (Vector) realtime_urls.get( url);
402             if(threads != null)
403                {
404                 if(!threads.contains(Thread.currentThread()))
405                    {
406                      threads.addElement(Thread.currentThread() );
407                    }
408                } else {
409                 threads = new Vector();
410                 threads.addElement(Thread.currentThread());
411                 realtime_urls.put( url, threads  );
412                }
413         }
414         
415     }
416     
417     /***
418     Remove a URL because it isn't downloading anymore.
419     */
420     static final void removeRealtimeURL( String url ) {
421         synchronized( realtime_urls )
422         {
423            Vector threads = (Vector) realtime_urls.get( url);
424            if(threads != null)
425                synchronized( threads  )
426                    {
427                     Thread realLoader = (Thread) threads.firstElement();
428                     if(realLoader == Thread.currentThread())
429                     {
430                       synchronized(url.intern())
431                      {
432                       realtime_urls.remove(url);
433                       url.intern().notifyAll();
434                       }          
435                      } else {
436                      threads.removeElement(Thread.currentThread());
437                      }
438                     }
439         }
440         
441     }
442 
443     /***
444     Return true if this URL isn't downloading in realtime.
445     */
446     static final boolean isRealtimeURL( String url ) {
447 
448         synchronized( realtime_urls ) {
449             return realtime_urls.get( url ) != null;
450         }
451             
452     }
453 
454     /***
455     Return the list of realtime URLs for debug
456     */
457     public static final Hashtable getRealtimeURLs() {
458         synchronized(realtime_urls) {
459             return realtime_urls;
460         }
461     }
462     
463 }