添加链接
link之家
链接快照平台
  • 输入网页链接,自动生成快照
  • 标签化管理网页链接

应用背景:
对存储在MinIO服务器的文件实现全文检索。也可以是其他服务器或本地文件,本文仅详细介绍MinIO文件的读取及转换。通过Elasticsearch的Ingest-Attachment插件抽取文件内容,支持Word、Excel、PDF、TXT等格式文件,无需手动解析文件内容。

上代码,详细解释可以阅读注释。

1.引入依赖

springboot已经管理好了依赖,只需引入spring-boot-starter-data-elasticsearch

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>

2.配置文件

elasticsearch:
  host: 192.168.2.154
  port: 9200

3.配置类

@Setter
@ConfigurationProperties(prefix = "elasticsearch")
@Configuration
public class ElasticSearchConfig {
    private String host;
    private Integer port;
    @Bean
    public RestHighLevelClient restHighLevelClient(){
        RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(new HttpHost(this.host, this.port)));
        return client;

4.实现类

package com.dmp.document.service.impl;
import com.alibaba.fastjson2.JSONObject;
import com.dmp.common.constant.HttpStatus;
import com.dmp.common.core.page.PageDomain;
import com.dmp.common.core.page.TableDataInfo;
import com.dmp.common.core.page.TableSupport;
import com.dmp.document.domain.dto.DocElasticsearchDto;
import com.dmp.document.domain.entity.DocDocument;
import com.dmp.document.service.DocDocumentService;
import com.dmp.document.service.ElasticsearchService;
import com.dmp.document.service.MinioClientService;
import com.dmp.system.service.ISysConfigService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.compress.utils.IOUtils;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.elasticsearch.common.text.Text;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import static com.sun.webkit.network.URLs.newURL;
import static org.elasticsearch.client.RequestOptions.DEFAULT;
 * @author daixin
 * @version 1.0
 * @description: TODO
 * @date 2022/11/23 17:38
@Slf4j
@Service
public class ElasticsearchServiceImpl implements ElasticsearchService {
    @Autowired
    private RestHighLevelClient esClient;
    @Autowired
    private MinioClientService minioClientService;
    @Autowired
    private DocDocumentService docDocumentService;
    @Autowired
    private ISysConfigService sysConfigService;
    public String createFileIndex(String id, String projectId) throws Exception {
        String result = null;
        InputStream is = null;
            Date date1 = new Date();
            //查询系统内存储的文件key
            DocDocument docDocument = docDocumentService.getById(id);
            String path = docDocument.getPath();
            //获取minio下载签名
            String url = minioClientService.getDownloadLink("file-bucket",path);
            //请求minio获取文件流
            URL url2= newURL(url);
            HttpURLConnection conn=(HttpURLConnection) url2.openConnection();
            conn.setDoInput(true);
            conn.connect();
            is = conn.getInputStream();
            //转码base64
            byte[] fileByteStream = IOUtils.toByteArray(is);
            String base64String = new String(Base64.getEncoder().encodeToString(fileByteStream).getBytes(), "UTF-8");
            //封装ES请求
            IndexRequest request;
            Map attachmentMap = new HashMap();
            attachmentMap.put("data", base64String);
            attachmentMap.put("fileName", docDocument.getName());
            attachmentMap.put("projectId",projectId);
            //查询系统参数
            String esIndex = sysConfigService.selectConfigByKey("es_index");
            String esPipe = sysConfigService.selectConfigByKey("es_pipe");
            //配置查询请求参数
            request = new IndexRequest(esIndex);
            request.id(String.valueOf(docDocument.getId()));
            request.setPipeline(esPipe);//文件抽取管道,需提前创建
            request.source(JSONObject.toJSONString(attachmentMap), XContentType.JSON);
            IndexResponse response = esClient.index(request, RequestOptions.DEFAULT);
            response.status().toString();
            Date date2 = new Date();
            log.info("创建索引-----耗时:{}ms" , (date2.getTime() - date1.getTime()));
        }catch(Exception e){
            throw e;
        }finally {
            is.close();
        return result;
    @Override
    public TableDataInfo matchContent(String content, String projectId) {
        //此处为若依框架提供的分页,可改为你自己的分页
        PageDomain pageDomain = TableSupport.buildPageRequest();
        Integer pageNum = pageDomain.getPageNum();
        Integer pageSize = pageDomain.getPageSize();
        TableDataInfo rspData = new TableDataInfo();
        rspData.setCode(HttpStatus.SUCCESS);
        rspData.setMsg("查询成功");
        //查询系统参数
        String esIndex = sysConfigService.selectConfigByKey("es_index");
        SearchRequest searchRequest = new SearchRequest(esIndex);
        //布尔查询,检索标题和内容,过滤项目id
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        MultiMatchQueryBuilder matchQueryBuilder = QueryBuilders.multiMatchQuery(content, "attachment.content","fileName");
        BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
        boolQueryBuilder.must(matchQueryBuilder);
        if(projectId != null){
            boolQueryBuilder.filter(QueryBuilders.termQuery("projectId", projectId));
        sourceBuilder.query(boolQueryBuilder);
        //配置高亮
        HighlightBuilder highlightBuilder = new HighlightBuilder();
        highlightBuilder.field("attachment.content"); //content字段高亮
        highlightBuilder.field("fileName");//fileName字段高亮
        highlightBuilder.preTags("<span style='color:red'>"); //高亮前缀
        highlightBuilder.postTags("</span>"); //高亮后缀
        sourceBuilder.highlighter(highlightBuilder);
        //分页查询
        sourceBuilder.from((pageNum-1)*pageSize).size(pageSize);
        searchRequest.source(sourceBuilder);
        SearchResponse searchResponse = null;
        try {
            searchResponse = esClient.search(searchRequest, DEFAULT);
        } catch (Throwable e) {
            //捕捉最高级别异常,确保打印详细信息
            e.printStackTrace();
        if(searchResponse.getHits() == null){
            rspData.setTotal(0);
            rspData.setRows(null);
            return rspData;
        List<DocElasticsearchDto> docElasticsearchList = new ArrayList<>();
        Long totalHits = searchResponse.getHits().getTotalHits().value;//匹配总条数,用于分页显示
        for (SearchHit hit : searchResponse.getHits()){
            //查询结果
            String source = hit.getSourceAsString();
            DocElasticsearchDto docElasticsearchDto = JSONObject.parseObject(source, DocElasticsearchDto.class);
            docElasticsearchDto.setId(hit.getId());
            //处理高亮字段
            Map<String, HighlightField> map = hit.getHighlightFields();
            if(map.containsKey("attachment.content")) {
                StringBuilder matchContent = new StringBuilder();
                for(Text t : map.get("attachment.content").fragments()){
                    matchContent.append(t.toString());
                docElasticsearchDto.getAttachment().put("content",matchContent.toString());
            if(map.containsKey("fileName")) {
                StringBuilder matchFileName =  new StringBuilder();
                for(Text t : map.get("fileName").fragments()){
                    matchFileName.append(t.toString());
                docElasticsearchDto.setFileName(matchFileName.toString());
            docElasticsearchList.add(docElasticsearchDto);
        rspData.setTotal(totalHits);
        rspData.setRows(docElasticsearchList);
        return rspData;
    @Override
    public void deleteFileIndex(String id) throws IOException {
        //查询系统参数,ES索引名称
        String esIndex = sysConfigService.selectConfigByKey("es_index");
        //删除索引
        DeleteRequest deleteRequest = new DeleteRequest(esIndex,id);
        esClient.delete(deleteRequest, RequestOptions.DEFAULT);

示例的实现是先从数据库查询到保存的文件信息,然后从MinIO文件存储服务器获取文件流。由于MinIO支持通过签名链接下载文件,这里就直接使用了;你也可以换成其他服务器,或者直接读取文件对象。创建索引时将文件内容以Base64编码后发送,Ingest-Attachment插件会自动完成内容抽取。Ingest-Attachment的安装可参考《Docker安装Elasticsearch及相关插件详细步骤,全程亲测避坑》(冰糖码奇朵的博客,CSDN)。

对存储在MinIO服务器的文件实现全文检索。也可以是其他服务器或本地文件,本文仅详细介绍MinIO文件的读取及转换。通过Elasticsearch的Ingest-Attachment插件抽取文件内容,支持Word、Excel、PDF、TXT等格式文件,无需手动解析文件内容。springboot已经管理好了依赖,只需引入spring-boot-starter-data-elasticsearch。配置类ElasticSearchConfig。