1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17 package org.apache.jetspeed.search.handlers;
18
19 // Java imports
20 import java.io.IOException;
21 import java.net.URL;
22
23 import org.apache.commons.httpclient.HttpClient;
24 import org.apache.commons.httpclient.HttpException;
25 import org.apache.commons.httpclient.methods.GetMethod;
26 import org.apache.jetspeed.search.AbstractObjectHandler;
27 import org.apache.jetspeed.search.BaseParsedObject;
28
29 /***
30 * This object handler deals with URLs.
31 *
32 * @author <a href="mailto:morciuch@apache.org">Mark Orciuch</a>
33 * @version $Id: URLToDocHandler.java 516448 2007-03-09 16:25:47Z ate $
34 */
35 public class URLToDocHandler extends AbstractObjectHandler
36 {
37 /***
38 * Static initialization of the logger for this class
39 */
40 //private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLToDocHandler.class.getName());
41
42 /***
43 * Parses a specific object into a document suitable for index placement
44 *
45 * @param o
46 * @return
47 */
48 public org.apache.jetspeed.search.ParsedObject parseObject(Object o)
49 {
50 org.apache.jetspeed.search.ParsedObject result = new BaseParsedObject();
51
52 if ((o instanceof URL) == false)
53 {
54 //logger.error("URLToDocHandler: invalid object type: " + o);
55 return null;
56 }
57
58 URL pageToAdd = (URL) o;
59
60 HttpClient client = new HttpClient();
61 GetMethod method = new GetMethod(pageToAdd.toString());
62 method.setFollowRedirects(true);
63 int statusCode = -1;
64 int attempt = 0;
65
66 try
67 {
68 // We will retry up to 3 times.
69 while (statusCode == -1 && attempt < 3)
70 {
71 try
72 {
73 // execute the method.
74 client.executeMethod(method);
75 statusCode = method.getStatusCode();
76 //if (logger.isDebugEnabled())
77 {
78 //logger.debug("URL = " + pageToAdd.toString() + "Status code = " + statusCode);
79 }
80 }
81 catch (HttpException e)
82 {
83 // We will retry
84 attempt++;
85 }
86 catch (IOException e)
87 {
88 return null;
89 }
90 }
91 // Check that we didn't run out of retries.
92 if (statusCode != -1)
93 {
94 String content = null;
95 try
96 {
97 content = method.getResponseBodyAsString();
98 }
99 catch (Exception ioe)
100 {
101 //logger.error("Getting content for " + pageToAdd.toString(), ioe);
102 }
103
104 if (content != null)
105 {
106 try
107 {
108 result.setKey(java.net.URLEncoder.encode(pageToAdd.toString(),"UTF-8"));
109 result.setType(org.apache.jetspeed.search.ParsedObject.OBJECT_TYPE_URL);
110 // TODO: We should extract the <title> tag here.
111 result.setTitle(pageToAdd.toString());
112 result.setContent(content);
113 result.setDescription("");
114 result.setLanguage("");
115 result.setURL(pageToAdd);
116 result.setClassName(o.getClass().getName());
117 //logger.info("Parsed '" + pageToAdd.toString() + "'");
118 }
119 catch (Exception e)
120 {
121 e.printStackTrace();
122 //logger.error("Adding document to index", e);
123 }
124 }
125 }
126 }
127 finally
128 {
129 method.releaseConnection();
130 }
131
132 return result;
133
134 }
135 }
136