import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.zip.GZIPInputStream; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpMethod; import org.apache.commons.httpclient.methods.GetMethod; public class Test { private String getHtmlSource(String url, String encoding) throws IOException { HttpClient client = new HttpClient(); HttpMethod method = new GetMethod(); method.setRequestHeader("user-agent", "Mozilla/5.0 (Windows; U; Windows NT 5.2; zh-TW; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1"); method.setRequestHeader("accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); method.setRequestHeader("accept-language", "en-us;q=0.7,en;q=0.3"); method.setRequestHeader("accept-encoding", "gzip,deflate"); method.setRequestHeader("accept-charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7"); method.setRequestHeader("keep-alive", "300"); method.setRequestHeader("connection", "keep-alive"); InputStream is = null; GZIPInputStream gzip = null; InputStreamReader isReader = null; BufferedReader bReader = null; client.executeMethod(method); is = method.getResponseBodyAsStream(); if (method.getResponseHeader("Content-Encoding") != null && method.getResponseHeader("Content-Encoding").getValue().equalsIgnoreCase("gzip")) { // 若有壓縮 gzip = new GZIPInputStream(is); isReader = new InputStreamReader(gzip, encoding); } else { isReader = new InputStreamReader(is, encoding); } bReader = new BufferedReader(isReader); StringBuilder sb = new StringBuilder(); String line = null; while ((line = bReader.readLine()) != null) { sb.append(String.format("%s ", line)); } if (bReader != null) bReader.close(); if (isReader != null) isReader.close(); if (gzip != null) gzip.close(); if (is != null) is.close(); method.releaseConnection(); return sb.toString(); } }
2009/02/01
Get HTML source using HttpClient
Apache Commons HttpClient version: 3.1