仿京东搜索


仿京东搜索

项目介绍:基于springboot的前后端分离项目,利用爬虫将京东首页的数据爬取下来,然后将数据放到ElasticSearch中,通过后端配置查询规则实现仿京东搜索。
功能:实现分页高亮查询
主要负责:
1、Jsoup爬取数据 。2、实现搜索数据(条件、精确、分页、高亮搜索)
项目地址:https://gitee.com/jamer/jingdong-search

依赖

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/>
    </parent>
    <groupId>com.renzhe</groupId>
    <artifactId>jd-search</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>jd-search</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
        <elasticsearch.version>7.6.1</elasticsearch.version>
    </properties>

    <dependencies>
        <!-- Jsoup: HTML parsing for the crawler -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.50</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>


工具类utils 用来解析网页 

package com.renzhe.utils;

import com.renzhe.pojo.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.boot.autoconfigure.condition.ConditionalOnJava;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

// Scrapes JD (jd.com) search result pages with Jsoup.
@Component
public class HtmlParseUtil {

    /**
     * Fetches the JD search page for the given keyword and parses every product entry.
     * <p>
     * Requires network access; content rendered by Ajax after page load is not
     * visible to Jsoup, which is why images are read from the lazy-load attribute.
     *
     * @param keyword search term appended to {@code https://search.jd.com/Search?keyword=}
     * @return parsed products (title, image URL, price text); empty list if the
     *         goods-list container is absent (e.g. page layout changed or request blocked)
     * @throws IOException if the page cannot be fetched within the 30s timeout
     */
    public List<Content> parseJD(String keyword) throws IOException {
        String url = "https://search.jd.com/Search?keyword=" + keyword;
        // Jsoup returns a DOM-like Document; the usual JS DOM accessors are available.
        Document document = Jsoup.parse(new URL(url), 30000);
        Element element = document.getElementById("J_goodsList");

        List<Content> goodsList = new ArrayList<>();
        // Guard: getElementById returns null when the container is missing — avoid NPE.
        if (element == null) {
            return goodsList;
        }

        // Each product card is an <li> inside the goods list.
        Elements elements = element.getElementsByTag("li");
        for (Element el : elements) {
            // Image sites lazy-load heavy assets: the real URL is in data-lazy-img,
            // the src attribute is only a placeholder until the page finishes rendering.
            String img = el.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = el.getElementsByClass("p-price").eq(0).text();
            String title = el.getElementsByClass("p-name").eq(0).text();

            Content content = new Content();
            content.setImg(img);
            content.setPrice(price);
            content.setTitle(title);

            goodsList.add(content);
        }
        return goodsList;
    }
}

pojo类  

package com.renzhe.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

/**
 * Product entry scraped from a JD search page and stored in the
 * "jd_goods" Elasticsearch index.
 */
@Data               // generates getters/setters, equals/hashCode AND toString —
                    // the former @ToString annotation was redundant and is dropped
@AllArgsConstructor
@NoArgsConstructor
public class Content {
    // Product name (highlighted in search results)
    private String title;
    // Lazy-loaded image URL taken from the data-lazy-img attribute
    private String img;
    // Raw price text as displayed on the page (not parsed to a number)
    private String price;

}

service

package com.renzhe.service;

import com.alibaba.fastjson.JSON;
import com.renzhe.pojo.Content;
import com.renzhe.utils.HtmlParseUtil;
import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.text.Text;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;

import javax.swing.text.Highlighter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

//业务编写
@Service
public class ContentService {


    @Autowired
    private RestHighLevelClient restHighLevelClient;
    //解析数据 放入es索引库中
    public Boolean parseContent(String keywords) throws IOException {
        List contents = new HtmlParseUtil().parseJD(keywords);
        //把查询的数据放入到es
        BulkRequest bulkRequest = new BulkRequest();
        bulkRequest.timeout("2m");
        for (int i = 0; i > searchPage(String keyword,int pageNo,int pageSize) throws IOException{
        if(pageNo<=1){
            pageNo = 1;
        }
        //条件搜索
        SearchRequest searchRequest = new SearchRequest("jd_goods");
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        //分页
        sourceBuilder.from(pageNo);
        sourceBuilder.size(pageSize);
        //精准匹配
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
        sourceBuilder.query(termQueryBuilder);
        sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));
        //执行搜索
        searchRequest.source(sourceBuilder);
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);


        //解析结果
        ArrayList> list = new ArrayList<>();
        for (SearchHit documentFields : searchResponse.getHits().getHits()) {
            list.add(documentFields.getSourceAsMap());
        }
        return list;

    }
    //3、获取这些数据实现搜索高亮功能
    public List> searchPageHighlightBuilder(String keyword,int pageNo,int pageSize) throws IOException{
        if(pageNo<=1){
            pageNo = 1;
        }
        //条件搜索
        SearchRequest searchRequest = new SearchRequest("jd_goods");
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        //分页
        sourceBuilder.from(pageNo);
        sourceBuilder.size(pageSize);
        //精准匹配
        TermQueryBuilder termQueryBuilder = QueryBuilders.termQuery("title", keyword);
        sourceBuilder.query(termQueryBuilder);
        sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS));

        //高亮
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field("title");
        highlightBuilder.requireFieldMatch(false);
        highlightBuilder.preTags("");
        highlightBuilder.postTags("");
        sourceBuilder.highlighter(highlightBuilder);


        //执行搜索
        searchRequest.source(sourceBuilder);
        SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT);


        //解析结果
        ArrayList> list = new ArrayList<>();
        for (SearchHit documentFields : searchResponse.getHits().getHits()) {
            //解析高亮的字段

            Map highlightFields =
                    documentFields.getHighlightFields();
            HighlightField title = highlightFields.get("title");
            Map sourceAsMap = documentFields.getSourceAsMap();//原来的结果
            if(title!=null){
                Text[] fragments = title.fragments();
                String newTitle = " ";
                for (Text text : fragments) {
                    newTitle += text;
                }
                sourceAsMap.put("title",newTitle);//使用高亮的资源替换掉原来的内容
            }


            list.add(sourceAsMap);
        }
        return list;

    }

}

config

package com.renzhe.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Spring configuration exposing the Elasticsearch high-level REST client
 * used by the service layer.
 */
@Configuration
public class ElasticSearchClientConfig {

    /** Client bound to the local single-node Elasticsearch instance. */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost localNode = new HttpHost("127.0.0.1", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(localNode));
    }
}

controller

package com.renzhe.controller;

import com.renzhe.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;
import java.util.List;
import java.util.Map;

//前端的请求编写
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;
    @GetMapping("/parse/{keywords}")
    public Boolean parse(@PathVariable("keywords") String keywords) throws IOException {
        Boolean flag = contentService.parseContent(keywords);
        System.out.println(flag);
        return flag;

    }


     @GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
    public List> search(@PathVariable("keyword") String keyword,@PathVariable("pageNo") int pageNo,@PathVariable("pageSize") int pageSize) throws Exception{
       return contentService.searchPageHighlightBuilder(keyword,pageNo,pageSize);
    }
}

application.properties

server.port=9090
#关闭thymeleaf的缓存
spring.thymeleaf.cache=false

相关