应用背景:
对存储在MinIO服务器的文件实现全文检索。也可以是其他服务器或本地文件,本文仅详细介绍MinIO文件的读取及转换。通过Elasticsearch的Ingest-Attachment插件抽取文件内容,支持Word、Excel、PDF、TXT等格式文件,无需手动解析文件内容。
下面直接上代码,详细说明请阅读代码中的注释。
1.引入依赖
Spring Boot 已经统一管理了依赖版本,只需引入 spring-boot-starter-data-elasticsearch 即可。
<!-- Spring Data Elasticsearch starter; no <version> is given here, it is left to Spring Boot's dependency management. -->
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
2.配置文件
# Elasticsearch connection settings, bound by ElasticSearchConfig (prefix "elasticsearch").
# host and port must be nested under the "elasticsearch" key for the binding to pick them up.
elasticsearch:
  host: 192.168.2.154
  port: 9200
3.配置类
@Setter
@ConfigurationProperties(prefix = "elasticsearch")
@Configuration
public class ElasticSearchConfig {
private String host;
private Integer port;
@Bean
public RestHighLevelClient restHighLevelClient(){
RestHighLevelClient client = new RestHighLevelClient(RestClient.builder(new HttpHost(this.host, this.port)));
return client;
4.实现类
package com.dmp.document.service.impl;
import com.alibaba.fastjson2.JSONObject;
import com.dmp.common.constant.HttpStatus;
import com.dmp.common.core.page.PageDomain;
import com.dmp.common.core.page.TableDataInfo;
import com.dmp.common.core.page.TableSupport;
import com.dmp.document.domain.dto.DocElasticsearchDto;
import com.dmp.document.domain.entity.DocDocument;
import com.dmp.document.service.DocDocumentService;
import com.dmp.document.service.ElasticsearchService;
import com.dmp.document.service.MinioClientService;
import com.dmp.system.service.ISysConfigService;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.compress.utils.IOUtils;
import org.elasticsearch.action.delete.DeleteRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.MultiMatchQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightField;
import org.elasticsearch.common.text.Text;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import static com.sun.webkit.network.URLs.newURL;
import static org.elasticsearch.client.RequestOptions.DEFAULT;
* @author daixin
* @version 1.0
* @description: TODO
* @date 2022/11/23 17:38
@Slf4j
@Service
public class ElasticsearchServiceImpl implements ElasticsearchService {
@Autowired
private RestHighLevelClient esClient;
@Autowired
private MinioClientService minioClientService;
@Autowired
private DocDocumentService docDocumentService;
@Autowired
private ISysConfigService sysConfigService;
public String createFileIndex(String id, String projectId) throws Exception {
String result = null;
InputStream is = null;
Date date1 = new Date();
//查询系统内存储的文件key
DocDocument docDocument = docDocumentService.getById(id);
String path = docDocument.getPath();
//获取minio下载签名
String url = minioClientService.getDownloadLink("file-bucket",path);
//请求minio获取文件流
URL url2= newURL(url);
HttpURLConnection conn=(HttpURLConnection) url2.openConnection();
conn.setDoInput(true);
conn.connect();
is = conn.getInputStream();
//转码base64
byte[] fileByteStream = IOUtils.toByteArray(is);
String base64String = new String(Base64.getEncoder().encodeToString(fileByteStream).getBytes(), "UTF-8");
//封装ES请求
IndexRequest request;
Map attachmentMap = new HashMap();
attachmentMap.put("data", base64String);
attachmentMap.put("fileName", docDocument.getName());
attachmentMap.put("projectId",projectId);
//查询系统参数
String esIndex = sysConfigService.selectConfigByKey("es_index");
String esPipe = sysConfigService.selectConfigByKey("es_pipe");
//配置查询请求参数
request = new IndexRequest(esIndex);
request.id(String.valueOf(docDocument.getId()));
request.setPipeline(esPipe);//文件抽取管道,需提前创建
request.source(JSONObject.toJSONString(attachmentMap), XContentType.JSON);
IndexResponse response = esClient.index(request, RequestOptions.DEFAULT);
response.status().toString();
Date date2 = new Date();
log.info("创建索引-----耗时:{}ms" , (date2.getTime() - date1.getTime()));
}catch(Exception e){
throw e;
}finally {
is.close();
return result;
@Override
public TableDataInfo matchContent(String content, String projectId) {
//此处为若依框架提供的分页,可改为你自己的分页
PageDomain pageDomain = TableSupport.buildPageRequest();
Integer pageNum = pageDomain.getPageNum();
Integer pageSize = pageDomain.getPageSize();
TableDataInfo rspData = new TableDataInfo();
rspData.setCode(HttpStatus.SUCCESS);
rspData.setMsg("查询成功");
//查询系统参数
String esIndex = sysConfigService.selectConfigByKey("es_index");
SearchRequest searchRequest = new SearchRequest(esIndex);
//布尔查询,检索标题和内容,过滤项目id
SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
MultiMatchQueryBuilder matchQueryBuilder = QueryBuilders.multiMatchQuery(content, "attachment.content","fileName");
BoolQueryBuilder boolQueryBuilder = QueryBuilders.boolQuery();
boolQueryBuilder.must(matchQueryBuilder);
if(projectId != null){
boolQueryBuilder.filter(QueryBuilders.termQuery("projectId", projectId));
sourceBuilder.query(boolQueryBuilder);
//配置高亮
HighlightBuilder highlightBuilder = new HighlightBuilder();
highlightBuilder.field("attachment.content"); //content字段高亮
highlightBuilder.field("fileName");//fileName字段高亮
highlightBuilder.preTags("<span style='color:red'>"); //高亮前缀
highlightBuilder.postTags("</span>"); //高亮后缀
sourceBuilder.highlighter(highlightBuilder);
//分页查询
sourceBuilder.from((pageNum-1)*pageSize).size(pageSize);
searchRequest.source(sourceBuilder);
SearchResponse searchResponse = null;
try {
searchResponse = esClient.search(searchRequest, DEFAULT);
} catch (Throwable e) {
//捕捉最高级别异常,确保打印详细信息
e.printStackTrace();
if(searchResponse.getHits() == null){
rspData.setTotal(0);
rspData.setRows(null);
return rspData;
List<DocElasticsearchDto> docElasticsearchList = new ArrayList<>();
Long totalHits = searchResponse.getHits().getTotalHits().value;//匹配总条数,用于分页显示
for (SearchHit hit : searchResponse.getHits()){
//查询结果
String source = hit.getSourceAsString();
DocElasticsearchDto docElasticsearchDto = JSONObject.parseObject(source, DocElasticsearchDto.class);
docElasticsearchDto.setId(hit.getId());
//处理高亮字段
Map<String, HighlightField> map = hit.getHighlightFields();
if(map.containsKey("attachment.content")) {
StringBuilder matchContent = new StringBuilder();
for(Text t : map.get("attachment.content").fragments()){
matchContent.append(t.toString());
docElasticsearchDto.getAttachment().put("content",matchContent.toString());
if(map.containsKey("fileName")) {
StringBuilder matchFileName = new StringBuilder();
for(Text t : map.get("fileName").fragments()){
matchFileName.append(t.toString());
docElasticsearchDto.setFileName(matchFileName.toString());
docElasticsearchList.add(docElasticsearchDto);
rspData.setTotal(totalHits);
rspData.setRows(docElasticsearchList);
return rspData;
@Override
public void deleteFileIndex(String id) throws IOException {
//查询系统参数,ES索引名称
String esIndex = sysConfigService.selectConfigByKey("es_index");
//删除索引
DeleteRequest deleteRequest = new DeleteRequest(esIndex,id);
esClient.delete(deleteRequest, RequestOptions.DEFAULT);
示例的实现流程是:先从数据库查询已保存的文件信息,再从MinIO文件存储服务器获取文件流。由于MinIO支持通过签名链接下载文件,这里直接使用了这种方式;你也可以换成其他服务器,或者直接读取本地文件对象。创建索引时,将文件内容进行Base64编码后放入请求的data字段,并指定文件抽取管道,Ingest-Attachment插件会自动完成内容的抽取和转换。Ingest-Attachment的安装可参考《Docker安装Elasticsearch及相关插件详细步骤,全程亲测避坑》(冰糖码奇朵的博客,CSDN博客)。
对存储在MinIO服务器的文件实现全文检索。也可以是其他服务器或本地文件,本文仅详细介绍MinIO文件的读取及转换。通过Elasticsearch的Ingest-Attachment插件抽取文件内容,支持Word、Excel、PDF、TXT等格式文件,无需手动解析文件内容。springboot已经管理好了依赖,只需引入spring-boot-starter-data-elasticsearch。配置类ElasticSearchConfig。