- 论坛徽章:
- 0
|
使用HttpClient获取网页,再用jsoup解析
Java代码
- 1.package jsoup;
- 2.
- 3.import org.apache.http.HttpEntity;
- 4.import org.apache.http.HttpResponse;
- 5.import org.apache.http.HttpStatus;
- 6.import org.apache.http.client.HttpClient;
- 7.import org.apache.http.client.methods.HttpGet;
- 8.import org.apache.http.impl.client.DefaultHttpClient;
- 9.import org.apache.http.util.EntityUtils;
- 10.import org.jsoup.Jsoup;
- 11.import org.jsoup.nodes.Document;
- 12.import org.jsoup.nodes.Element;
- 13.import org.jsoup.select.Elements;
- 14.
- 15./**
- 16. * 利用HttpClient获取html代码,然后使用jsoup对html代码进行解析
- 17. * @author Administrator
- 18. *
- 19. */
- 20.public class JustTest {
- 21. public static void main(String[] args) {
- 22. String html = getHtmlByUrl("http://www.iteye.com/");
- 23. if (html != null && !"".equals(html)) {
- 24. Document doc = Jsoup.parse(html);
- 25. Elements linksElements = doc
- 26. .select("div#page>div#content>div#main>div.left>div#recommend>ul>li>a");
- 27. // 以上代码的意思是 找id为“page”的div里面 id为“content”的div里面 id为“main”的div里面
- 28. // class为“left”的div里面 id为“recommend”的div里面ul里面li里面a标签
- 29. for (Element ele : linksElements) {
- 30. String href = ele.attr("href");
- 31. String title = ele.text();
- 32. System.out.println(href + "," + title);
- 33. }
- 34. }
- 35. }
- 36.
- 37. /**
- 38. * 根据URL获得所有的html信息
- 39. *
- 40. * @param url
- 41. * @return
- 42. */
- 43. public static String getHtmlByUrl(String url) {
- 44. String html = null;
- 45. HttpClient httpClient = new DefaultHttpClient();// 创建httpClient对象
- 46. HttpGet httpget = new HttpGet(url);// 以get方式请求该URL
- 47. try {
- 48. HttpResponse responce = httpClient.execute(httpget);// 得到responce对象
- 49. int resStatu = responce.getStatusLine().getStatusCode();// 返回码
- 50. if (resStatu == HttpStatus.SC_OK) {// 200正常 其他就不对
- 51. // 获得相应实体
- 52. HttpEntity entity = responce.getEntity();
- 53. if (entity != null) {
- 54. html = EntityUtils.toString(entity);// 获得html源代码
- 55. System.out.println(html);
- 56. }
- 57. }
- 58. } catch (Exception e) {
- 59. System.out.println("访问【" + url + "】出现异常!");
- 60. e.printStackTrace();
- 61. } finally {
- 62. httpClient.getConnectionManager().shutdown();
- 63. }
- 64. return html;
- 65. }
- 66.}
复制代码 |
|