仿京东搜索
仿京东搜索
项目介绍:基于Spring Boot的前后端分离项目,利用Jsoup爬虫将京东搜索结果页的商品数据爬取下来,存入ElasticSearch,再通过后端配置的查询规则实现仿京东搜索。
功能:实现分页高亮查询
主要负责:
1、Jsoup爬取数据 。2、实现搜索数据(条件、精确、分页、高亮搜索)
项目地址:https://gitee.com/jamer/jingdong-search
依赖
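<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.2.5.RELEASE</version>
        <relativePath/>
    </parent>
    <groupId>com.renzhe</groupId>
    <artifactId>jd-search</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <name>jd-search</name>
    <description>Demo project for Spring Boot</description>

    <properties>
        <java.version>1.8</java.version>
        <elasticsearch.version>7.6.1</elasticsearch.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.3</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.50</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-thymeleaf</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-devtools</artifactId>
            <scope>runtime</scope>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <optional>true</optional>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-test</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.springframework.boot</groupId>
                <artifactId>spring-boot-maven-plugin</artifactId>
                <configuration>
                    <excludes>
                        <exclude>
                            <groupId>org.projectlombok</groupId>
                            <artifactId>lombok</artifactId>
                        </exclude>
                    </excludes>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>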
工具类utils 用来解析网页
package com.renzhe.utils;

import com.renzhe.pojo.Content;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.boot.autoconfigure.condition.ConditionalOnJava;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

/**
 * Scrapes product entries from a JD.com search results page using Jsoup.
 */
@Component
public class HtmlParseUtil {

    /**
     * Fetches {@code https://search.jd.com/Search?keyword=<keyword>} and extracts one
     * {@link Content} per {@code <li>} found under the {@code J_goodsList} element.
     *
     * <p>Requires network access. Only the static HTML is parsed, so anything the page
     * loads later via ajax is not visible here.
     *
     * @param keyword search term; it is URL-encoded as UTF-8 before being placed in the query string
     * @return the scraped items, or an empty list when the page has no {@code J_goodsList} element
     * @throws IOException if the page cannot be fetched or parsed
     */
    public List<Content> parseJD(String keyword) throws IOException {
        // Encode the keyword so that non-ASCII terms and spaces form a valid query string.
        String url = "https://search.jd.com/Search?keyword=" + URLEncoder.encode(keyword, "UTF-8");

        // Jsoup returns a browser-like Document; the second argument is the timeout in milliseconds.
        Document document = Jsoup.parse(new URL(url), 30000);

        List<Content> goodsList = new ArrayList<>();

        Element goodsContainer = document.getElementById("J_goodsList");
        if (goodsContainer == null) {
            // The expected container is absent (layout change, redirect, blocked request, ...):
            // return no results rather than failing with a NullPointerException.
            return goodsList;
        }

        // Each <li> under the container is treated as one product entry.
        Elements items = goodsContainer.getElementsByTag("li");
        for (Element item : items) {
            // Images are lazy-loaded on this page, so the reference is read from
            // "data-lazy-img" rather than from "src".
            String img = item.getElementsByTag("img").eq(0).attr("data-lazy-img");
            String price = item.getElementsByClass("p-price").eq(0).text();
            String title = item.getElementsByClass("p-name").eq(0).text();

            Content content = new Content();
            content.setImg(img);
            content.setPrice(price);
            content.setTitle(title);
            goodsList.add(content);
        }
        return goodsList;
    }
}
pojo类
package com.renzhe.pojo;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.ToString;

/**
 * Plain data holder for one product entry: its title, image reference and price,
 * all kept as strings. Accessors, constructors and {@code toString} are generated by Lombok.
 */
@Data
@ToString
@NoArgsConstructor
@AllArgsConstructor
public class Content {

    /** Product title text. */
    private String title;

    /** Image reference for the product. */
    private String img;

    /** Price text for the product. */
    private String price;
}
service
package com.renzhe.service; import com.alibaba.fastjson.JSON; import com.renzhe.pojo.Content; import com.renzhe.utils.HtmlParseUtil; import org.elasticsearch.action.bulk.BulkRequest; import org.elasticsearch.action.bulk.BulkResponse; import org.elasticsearch.action.index.IndexRequest; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; import org.elasticsearch.client.RequestOptions; import org.elasticsearch.client.RestHighLevelClient; import org.elasticsearch.common.text.Text; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.xcontent.XContentType; import org.elasticsearch.index.query.QueryBuilders; import org.elasticsearch.index.query.TermQueryBuilder; import org.elasticsearch.search.SearchHit; import org.elasticsearch.search.builder.SearchSourceBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; import org.elasticsearch.search.fetch.subphase.highlight.HighlightField; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import javax.swing.text.Highlighter; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; //业务编写 @Service public class ContentService { @Autowired private RestHighLevelClient restHighLevelClient; //解析数据 放入es索引库中 public Boolean parseContent(String keywords) throws IOException { Listcontents = new HtmlParseUtil().parseJD(keywords); //把查询的数据放入到es BulkRequest bulkRequest = new BulkRequest(); bulkRequest.timeout("2m"); for (int i = 0; i > searchPage(String keyword,int pageNo,int pageSize) throws IOException{ if(pageNo<=1){ pageNo = 1; } //条件搜索 SearchRequest searchRequest = new SearchRequest("jd_goods"); SearchSourceBuilder sourceBuilder = new SearchSourceBuilder(); //分页 sourceBuilder.from(pageNo); sourceBuilder.size(pageSize); //精准匹配 TermQueryBuilder termQueryBuilder = 
QueryBuilders.termQuery("title", keyword); sourceBuilder.query(termQueryBuilder); sourceBuilder.timeout(new TimeValue(60, TimeUnit.SECONDS)); //执行搜索 searchRequest.source(sourceBuilder); SearchResponse searchResponse = restHighLevelClient.search(searchRequest, RequestOptions.DEFAULT); //解析结果 ArrayList
config
package com.renzhe.config;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

/**
 * Spring configuration that exposes the Elasticsearch high-level REST client as a bean.
 */
@Configuration
public class ElasticSearchClientConfig {

    /**
     * Builds a {@link RestHighLevelClient} that talks to a single node at
     * {@code http://127.0.0.1:9200}.
     *
     * @return the client instance registered in the application context
     */
    @Bean
    public RestHighLevelClient restHighLevelClient() {
        HttpHost localNode = new HttpHost("127.0.0.1", 9200, "http");
        return new RestHighLevelClient(RestClient.builder(localNode));
    }
}
controller
package com.renzhe.controller;

import com.renzhe.service.ContentService;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.RestController;

import java.io.IOException;
import java.util.List;
import java.util.Map;

/**
 * REST endpoints used by the front end: one to scrape and index products for a keyword,
 * and one to run a paged search.
 */
@RestController
public class ContentController {

    @Autowired
    private ContentService contentService;

    /**
     * Delegates to {@code ContentService.parseContent} for the given keywords and returns its result.
     *
     * @param keywords search term taken from the URL path
     * @return the flag returned by the service
     * @throws IOException propagated from the service call
     */
    @GetMapping("/parse/{keywords}")
    public Boolean parse(@PathVariable("keywords") String keywords) throws IOException {
        Boolean flag = contentService.parseContent(keywords);
        System.out.println(flag);
        return flag;
    }

    /**
     * Delegates to {@code ContentService.searchPageHighlightBuilder} and returns its result.
     *
     * @param keyword  search term taken from the URL path
     * @param pageNo   page number taken from the URL path
     * @param pageSize page size taken from the URL path
     * @return the list of result maps produced by the service
     * @throws Exception propagated from the service call
     */
    @GetMapping("/search/{keyword}/{pageNo}/{pageSize}")
    public List<Map<String, Object>> search(@PathVariable("keyword") String keyword,
                                            @PathVariable("pageNo") int pageNo,
                                            @PathVariable("pageSize") int pageSize) throws Exception {
        return contentService.searchPageHighlightBuilder(keyword, pageNo, pageSize);
    }
}
application.properties
server.port=9090
#关闭thymeleaf的缓存
spring.thymeleaf.cache=false