基于HttpClient和JSoup的java网络爬虫

1 引言

网络爬虫（Web Crawler），又称为网络蜘蛛（Web Spider）或 Web 信息采集器，是一种按照一定规则，自动抓取或下载网络信息的计算机程序或自动化脚本。网络爬虫本质上就是通过模拟浏览器的方式获取服务器数据。

Java 网络爬虫具有很好的扩展性可伸缩性，其是目前搜索引擎开发的重要组成部分。例如，著名的网络爬虫工具Nutch便是采用 Java 开发。

2 HttpClient

HttpClient 是 Apache Jakarta Common 下的子项目，用来提供高效的、最新的、功能丰富的支持 HTTP 协议的客户端编程工具包，并且它支持 HTTP 协议最新的版本和建议。其相比于传统 JDK 自带的 URLConnection，增加了易用性和灵活性。其功能主要是用来向服务器发送请求，并返回相关资源。在网络爬虫实战中，经常使用 HttpClient 获取网页内容，使用 jsoup 解析网页内容。

2.1 首先需要引入相关依赖：


    org.apache.httpcomponents
    httpclient
    4.5.3

2.2 代码示例

import org.apache.http.NameValuePair;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
import org.junit.Test;

import java.util.ArrayList;
import java.util.List;

public class HttpClientTest {
    /**
     * 测试HttpClient发送Get请求
     */
    @Test
    public void testGet() throws Exception {
        //0.创建配置
        RequestConfig requestConfig = RequestConfig.custom()
                .setSocketTimeout(10000)//设置连接的超时时间
                .setConnectTimeout(10000)//设置创建连接的超时时间
                .setConnectionRequestTimeout(10000)//设置请求超时时间
                //.setProxy(new HttpHost("123.207.57.145",  1080, null))//添加代理
                .build();

        //1.创建HttpClient对象
        //HttpClient httpClient = new DefaultHttpClient();//不用
        //CloseableHttpClient httpClient = HttpClients.createDefault();//简单API
        CloseableHttpClient httpClient = HttpClients.custom().setDefaultRequestConfig(requestConfig).build();//常用

        //2.创建HttpGet请求
        String uri = "https://www.baidu.com?so=java";
        HttpGet httpGet = new HttpGet(uri);
        //或者单独给httpGet设置
        //httpGet.setConfig(requestConfig);

        //3.设置请求头
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36");

        CloseableHttpResponse response = null;
        try {
            //4.使用HttpClient发起请求
            response = httpClient.execute(httpGet);
            //5.判断响应状态码是否为200
            if (response.getStatusLine().getStatusCode() == 200) {
                //6.获取响应数据
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            //6.关闭资源
            response.close();
            httpClient.close();
        }
    }

    /**
     * 测试HttpClient发送Post请求
     */
    @Test
    public void testPost() throws Exception {
        //1.创建HttpClient对象
        CloseableHttpClient httpClient = HttpClients.createDefault();
        //2.创建HttpPost请求
        HttpPost httpPost = new HttpPost("https://www.baidu.com");
        //3.准备参数
        List params = new ArrayList();
        params.add(new BasicNameValuePair("so", "java"));
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "UTF-8");
        //4.设置参数
        httpPost.setEntity(formEntity);
        //5.发起请求
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpPost);
            if (response.getStatusLine().getStatusCode() == 200) {
                //6.获取响应
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            //7.关闭资源
            response.close();
            httpClient.close();
        }
    }

    /**
     * 测试HttpClient连接池
     */
    @Test
    public void testPool() throws Exception {
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        //设置最大连接数
        cm.setMaxTotal(200);
        //设置每个主机的并发数
        cm.setDefaultMaxPerRoute(20);
        doGet(cm);
        doGet(cm);
    }

    private static void doGet(PoolingHttpClientConnectionManager cm) throws Exception {
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        //在下一行加断点
        HttpGet httpGet = new HttpGet("https://www.baidu.com");
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            if (response.getStatusLine().getStatusCode() == 200) {
                String content = EntityUtils.toString(response.getEntity(), "UTF-8");
                System.out.println(content.length());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            //释放连接
            response.close();
            //不能关闭HttpClient
            //httpClient.close();
        }
    }
}

2.3 下面是封装好的工具类

import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.util.EntityUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * @Desc 封装 HttpClient 工具,方便爬取网页内容
 * @Author Kerwin
 * @Date 2021/12/27 13:59
 */
public abstract class HttpUtils {
    //声明httpClient管理器对象(HttpClient连接池)
    private static PoolingHttpClientConnectionManager cm = null;
    //声明请求配置对象
    private static RequestConfig config = null;
    //声明用户代理列表，每次请求时从中随机取出一个用户代理
    private static List userAgentList = null;

    //静态代码块会在类被加载的时候执行
    static{
        cm = new PoolingHttpClientConnectionManager();
        cm.setMaxTotal(200);
        cm.setDefaultMaxPerRoute(20);
        config = RequestConfig.custom()
                .setSocketTimeout(10000)
                .setConnectTimeout(10000)
                .setConnectionRequestTimeout(10000)
//                .setProxy(new HttpHost("123.180.68.60",8060)) // 添加代理
                .build();
        userAgentList = new ArrayList();
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:73.0) Gecko/20100101 Firefox/73.0");
        userAgentList.add("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.5 Safari/605.1.15");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36");
        userAgentList.add("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0");

    }

    public static String getHtml(String url){
       //1.从连接池中获取HttpClient对象
        CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(cm).build();
        //2.创建HttpGet对象
        HttpGet httpGet = new HttpGet(url);
        //3.设置请求配置对象和请求头
        httpGet.setConfig(config);
        httpGet.setHeader("User-Agent",userAgentList.get(new Random().nextInt(userAgentList.size())));
        //4.发起请求
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(httpGet);
            //5.获取响应内容
            if (response.getStatusLine().getStatusCode() == 200){
                String html = "";
                if(response.getEntity()!=null){
                    html = EntityUtils.toString(response.getEntity(), "UTF-8");
                }
                return html;
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (response != null) {
                    response.close();
                }
                //httpClient.close();//注意:这里的HttpClient是从cm(连接池)中获取的,不需要关闭
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }
    
	// 测试方法
    public static void main(String[] args) {
        String html = HttpUtils.getHtml("https://www.baidu.com");
        System.out.println(html);
    }
}

3 JSoup

我们抓取到页面之后，还需要对页面进行解析。可以使用字符串处理工具解析页面，也可以使用正则表达式，但是这些方法都会带来很大的开发成本，所以我们需要使用一款专门解析html页面的技术。

jsoup 是一款基于 Java 语言的 HTML 请求及解析器，可直接请求某个 URL 地址、解析 HTML 文本内容。它提供了一套非常省力的 API，可通过 DOM、CSS 以及类似于 jQuery 的操作方法来取出和操作数据。

jsoup的主要功能如下：

从一个URL，文件或字符串中解析HTML；
使用DOM或CSS选择器来查找、取出数据；
可操作HTML元素、属性、文本；

注意:
虽然使用Jsoup可以替代HttpClient直接发起请求解析数据，但是往往不会这样用，因为实际的开发过程中，需要使用到多线程，连接池，代理等等方式，而jsoup对这些的支持并不是很好，所以我们一般把jsoup仅仅作为Html解析工具使用

3.1 引入相关依赖


    org.jsoup
    jsoup
    1.10.3

3.2 代码示例

import org.apache.commons.io.FileUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.Test;

import java.io.File;
import java.net.URL;

public class JsopTest {
    /**
     * 测试JSoup-获取Document
     */
    @Test
    public void testDocument() throws Exception {
        Document doc1 = Jsoup.connect("https://www.baidu.com").get();
        Document doc2 = Jsoup.parse(new URL("https://www.baidu.com"), 1000);

        String html = FileUtils.readFileToString(new File("jsoup.html"), "UTF-8");
        Document doc3 = Jsoup.parse(html);

        System.out.println(doc1);
        System.out.println(doc2);
        System.out.println(doc3);
    }

    /**
     * 测试JSoup-解析html
     */
    @Test
    public void testJsoupHtml() throws Exception {
        Document doc = Jsoup.parse(new File("jsoup.html"), "UTF-8");

        //**使用dom方式遍历文档
        //1. 根据id查询元素getElementById
        Element element = doc.getElementById("city_bj");
        System.out.println(element.text());
        //2. 根据标签获取元素getElementsByTag
        element = doc.getElementsByTag("title").first();
        System.out.println(element.text());
        //3. 根据class获取元素getElementsByClass
        element = doc.getElementsByClass("s_name").last();
        System.out.println(element.text());
        //4. 根据属性获取元素getElementsByAttribute
        element = doc.getElementsByAttribute("abc").first();
        System.out.println(element.text());
        element = doc.getElementsByAttributeValue("class", "city_con").first();
        System.out.println(element.text());


        //**元素中数据获取
        //1. 从元素中获取id
        String str = element.id();
        System.out.println(str);
        //2. 从元素中获取className
        str = element.className();
        System.out.println(str);
        //3. 从元素中获取属性的值attr
        str = element.attr("id");
        System.out.println(str);
        //4. 从元素中获取所有属性attributes
        str = element.attributes().toString();
        System.out.println(str);
        //5. 从元素中获取文本内容text
        str = element.text();
        System.out.println(str);

        //**使用选择器语法查找元素
        //jsoup elements对象支持类似于CSS (或jquery)的选择器语法，来实现非常强大和灵活的查找功能。
        //select方法在Document/Element/Elements对象中都可以使用。可实现指定元素的过滤，或者链式选择访问。
        //1. tagname: 通过标签查找元素，比如：span
        Elements span = doc.select("span");
        for (Element e : span) {
            System.out.println(e.text());
        }
        //2. #id: 通过ID查找元素，比如：#city_bjj
        str = doc.select("#city_bj").text();
        System.out.println(str);
        //3. .class: 通过class名称查找元素，比如：.class_a
        str = doc.select(".class_a").text();
        System.out.println(str);
        //4. [attribute]: 利用属性查找元素，比如：[abc]
        str = doc.select("[abc]").text();
        System.out.println(str);
        //5. [attr=value]: 利用属性值来查找元素，比如：[class=s_name]
        str = doc.select("[class=s_name]").text();
        System.out.println(str);


        //**Selector选择器组合使用
        //1. el#id: 元素+ID，比如： h3#city_bj
        str = doc.select("h3#city_bj").text();
        System.out.println(str);
        //2. el.class: 元素+class，比如： li.class_a
        str = doc.select("li.class_a").text();
        System.out.println(str);
        //3. el[attr]: 元素+属性名，比如： span[abc]
        str = doc.select("span[abc]").text();
        System.out.println(str);
        //4. 任意组合，比如：span[abc].s_name
        str = doc.select("span[abc].s_name").text();
        System.out.println(str);
        //5. ancestor child: 查找某个元素下子元素，比如：.city_con li 查找"city_con"下的所有li
        str = doc.select(".city_con li").text();
        System.out.println(str);
        //6. parent > child: 查找某个父元素下的直接子元素，
        //比如：.city_con > ul > li 查找city_con第一级（直接子元素）的ul，再找所有ul下的第一级li
        str = doc.select(".city_con > ul > li").text();
        System.out.println(str);
        //7. parent > * 查找某个父元素下所有直接子元素.city_con > *
        str = doc.select(".city_con > *").text();
        System.out.println(str);
    }
}