1/*2 * Licensed to the Apache Software Foundation (ASF) under one or more3 * contributor license agreements. See the NOTICE file distributed with4 * this work for additional information regarding copyright ownership.5 * The ASF licenses this file to You under the Apache License, Version 2.06 * (the "License"); you may not use this file except in compliance with7 * the License. You may obtain a copy of the License at8 * 9 * http://www.apache.org/licenses/LICENSE-2.010 * 11 * Unless required by applicable law or agreed to in writing, software12 * distributed under the License is distributed on an "AS IS" BASIS,13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.14 * See the License for the specific language governing permissions and15 * limitations under the License.16 */17packageorg.apache.jetspeed.search.handlers;
1819// Java imports20import java.io.IOException;
21import java.net.URL;
2223import org.apache.commons.httpclient.HttpClient;
24import org.apache.commons.httpclient.HttpException;
25import org.apache.commons.httpclient.methods.GetMethod;
26import org.apache.jetspeed.search.AbstractObjectHandler;
27import org.apache.jetspeed.search.BaseParsedObject;
2829/***30 * This object handler deals with URLs.31 * 32 * @author <a href="mailto:morciuch@apache.org">Mark Orciuch</a>33 * @version $Id: URLToDocHandler.java 516448 2007-03-09 16:25:47Z ate $34 */35publicclassURLToDocHandlerextendsAbstractObjectHandler36 {
37/***38 * Static initialization of the logger for this class39 */40//private static final JetspeedLogger logger = JetspeedLogFactoryService.getLogger(URLToDocHandler.class.getName());4142/***43 * Parses a specific object into a document suitable for index placement44 * 45 * @param o46 * @return 47 */48public org.apache.jetspeed.search.ParsedObject parseObject(Object o)
49 {
50 org.apache.jetspeed.search.ParsedObject result = newBaseParsedObject();
5152if ((o instanceof URL) == false)
53 {
54//logger.error("URLToDocHandler: invalid object type: " + o);55returnnull;
56 }
5758 URL pageToAdd = (URL) o;
5960 HttpClient client = new HttpClient();
61 GetMethod method = new GetMethod(pageToAdd.toString());
62 method.setFollowRedirects(true);
63int statusCode = -1;
64int attempt = 0;
6566try67 {
68// We will retry up to 3 times.69while (statusCode == -1 && attempt < 3)
70 {
71try72 {
73// execute the method.74 client.executeMethod(method);
75 statusCode = method.getStatusCode();
76//if (logger.isDebugEnabled())77 {
78//logger.debug("URL = " + pageToAdd.toString() + "Status code = " + statusCode);79 }
80 }
81catch (HttpException e)
82 {
83// We will retry84 attempt++;
85 }
86catch (IOException e)
87 {
88returnnull;
89 }
90 }
91// Check that we didn't run out of retries.92if (statusCode != -1)
93 {
94 String content = null;
95try96 {
97 content = method.getResponseBodyAsString();
98 }
99catch (Exception ioe)
100 {
101//logger.error("Getting content for " + pageToAdd.toString(), ioe);102 }
103104if (content != null)
105 {
106try107 {
108 result.setKey(java.net.URLEncoder.encode(pageToAdd.toString(),"UTF-8"));
109 result.setType(org.apache.jetspeed.search.ParsedObject.OBJECT_TYPE_URL);
110// TODO: We should extract the <title> tag here.111 result.setTitle(pageToAdd.toString());
112 result.setContent(content);
113 result.setDescription("");
114 result.setLanguage("");
115 result.setURL(pageToAdd);
116 result.setClassName(o.getClass().getName());
117//logger.info("Parsed '" + pageToAdd.toString() + "'");118 }
119catch (Exception e)
120 {
121 e.printStackTrace();
122//logger.error("Adding document to index", e);123 }
124 }
125 }
126 }
127finally128 {
129 method.releaseConnection();
130 }
131132return result;
133134 }
135 }
136