View Javadoc

1   /*
2    * Copyright 2000-2004 The Apache Software Foundation.
3    * 
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    * 
8    *      http://www.apache.org/licenses/LICENSE-2.0
9    * 
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package org.apache.jetspeed.services.search.handlers;
17  
18  // Java imports
19  import java.io.IOException;
20  import java.net.URL;
21  
22  // Commons HTTPClient
23  import org.apache.commons.httpclient.HttpClient;
24  import org.apache.commons.httpclient.HttpException;
25  import org.apache.commons.httpclient.methods.GetMethod;
26  
27  // Jetspeed imports
28  import org.apache.jetspeed.services.logging.JetspeedLogFactoryService;
29  import org.apache.jetspeed.services.logging.JetspeedLogger;
30  import org.apache.jetspeed.services.search.AbstractObjectHandler;
31  import org.apache.jetspeed.services.search.BaseParsedObject;
32  import org.apache.jetspeed.services.search.ParsedObject;
33  
34  /***
35   * This object handler deals with URLs.
36   * 
37   * @author <a href="mailto:morciuch@apache.org">Mark Orciuch</a>
38   * @version $Id: URLToDocHandler.java,v 1.4 2004/02/23 03:47:46 jford Exp $
39   */
40  public class URLToDocHandler extends AbstractObjectHandler
41  {
42      /***
43       * Static initialization of the logger for this class
44       */    
45      private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLToDocHandler.class.getName());
46      
47      /***
48       * Parses a specific object into a document suitable for index placement
49       * 
50       * @param o
51       * @return 
52       */
53      public ParsedObject parseObject(Object o)
54      {
55          ParsedObject result = new BaseParsedObject();
56  
57          if ((o instanceof URL) == false)
58          {
59              logger.error("URLToDocHandler: invalid object type: " + o);
60              return null;
61          }
62  
63          URL pageToAdd = (URL) o;
64  
65          HttpClient client = new HttpClient();
66          client.startSession(pageToAdd);
67          GetMethod method = new GetMethod(pageToAdd.getPath());
68          method.setFollowRedirects(true);
69          int statusCode = -1;
70          int attempt = 0;
71  
72          // We will retry up to 3 times.
73          while (statusCode == -1 && attempt < 3)
74          {
75              try
76              {
77                  // execute the method.
78                  client.executeMethod(method);
79                  statusCode = method.getStatusCode();
80                  if (logger.isDebugEnabled())
81                  {
82                      logger.debug("URL = " + pageToAdd.toString() + "Status code = " + statusCode);
83                  }
84              }
85              catch (HttpException e)
86              {
87                  // We will retry
88              }
89              catch (IOException e)
90              {
91                  return null;
92              }
93          }
94          // Check that we didn't run out of retries.
95          if (statusCode != -1)
96          {
97              String content = null;
98              try
99              {
100                 content = method.getDataAsString();
101             }
102             catch (IOException ioe)
103             {
104                 logger.error("Getting content for " + pageToAdd.toString(), ioe);
105             }
106 
107             if (content != null)
108             {
109                 try
110                 {
111                     result.setKey(java.net.URLEncoder.encode(pageToAdd.toString()));
112                     result.setType(ParsedObject.OBJECT_TYPE_URL);
113                     // TODO: We should extract the <title> tag here.
114                     result.setTitle(pageToAdd.toString());
115                     result.setContent(content);
116                     result.setDescription("");
117                     result.setLanguage("");
118                     result.setURL(pageToAdd);
119                     result.setClassName(o.getClass().getName());
120                     logger.info("Parsed '" + pageToAdd.toString() + "'");
121                 }
122                 catch (Exception e)
123                 {
124                     e.printStackTrace();
125                     logger.error("Adding document to index", e);
126                 }
127             }
128         }
129         try
130         {
131             client.endSession();
132         }
133         catch (IOException ioe)
134         {
135             ioe.printStackTrace();
136             logger.error("Ending session to " + pageToAdd.toString(), ioe);
137         }
138 
139         return result;
140 
141     }
142 }
143