如何实现模糊查询_表格存储-阿里云帮助中心

对于通配符查询（WildcardQuery）中查询模式为 *word* 的场景，您可以使用模糊分词方式（即模糊分词和短语匹配查询组合使用）来实现性能更好的模糊查询。

背景信息

模糊查询是数据库业务中常见的需求，例如查询文件名、手机号码等。在表格存储中要实现模糊查询，通常使用多元索引的通配符查询来实现类似于MySQL中的like功能。但是通配符查询存在两个问题：查询词长度受限（最长32个字符），并且查询性能会随着数据量增长而下降。

为了解决通配符查询存在的问题，多元索引支持使用模糊分词方式来实现性能更好的模糊查询。当使用模糊分词方式时，查询词长度无限制，但是原文内容最多保留1024个字符（或汉字），超出部分会被截断，即只保留前1024个字符（或汉字）。

适用场景

请根据查询场景选择合适的方式实现模糊查询。

对于通配符查询中查询模式为 *word* 的场景，例如通过 "123" 匹配手机号码中任意位置包含 123 的号码，请使用模糊分词方式来实现模糊查询。
在此场景中，大部分情况下使用模糊分词方式会比使用通配符查询有10倍以上的性能提升。

假设数据表中包含file_name列，该列在多元索引中的字段类型为Text且分词类型为模糊分词（Fuzzy_Analyzer）。如果使用多元索引查询需要查询到file_name列值为 2021 woRK@杭州 的行，则查询时必须使用短语匹配查询（MatchPhraseQuery）且设置查询词为位置连续的子字符串。
- 如果查询词为 2021 、 20 、 21 、 work 、 WORK 、 @ 、 杭 、 州 、 杭州 、 @杭州 中的任意一个，则可以匹配到file_name列值为 2021 woRK@杭州 的行。
- 如果查询词为 21work 、 2021杭州 、 2120 、 #杭州 中的任意一个，则无法匹配到file_name列值为 2021 woRK@杭州 的行。
对于其他复杂查询场景，请使用通配符查询方式来实现模糊查询。更多信息，请参见通配符查询。

使用方式

使用模糊分词方式实现模糊查询的具体步骤如下：

创建多元索引时，指定列类型为Text且分词类型为模糊分词（Fuzzy Analyzer），其他参数保持默认配置即可。

package com.aliyun.tablestore.search.test;
import com.alicloud.openservices.tablestore.SyncClient;
import com.alicloud.openservices.tablestore.model.*;
import com.alicloud.openservices.tablestore.model.search.*;
import com.alicloud.openservices.tablestore.model.search.query.QueryBuilders;
import org.junit.Test;
import java.util.Arrays;
import java.util.Collections;
import static org.junit.Assert.assertEquals;
public class Test {
    private static final Conf conf = Conf.newInstance("src/test/resources/conf.json");
    private static final SyncClient ots = new SyncClient(conf.getEndpoint(), conf.getAccessId(), conf.getAccessKey(), conf.getInstanceName());
    private static final String tableName = "analysis_test";
    private static final String indexName = "analysis_test_index";
    @Test
    public void testFuzzyMatchPhrase() {
        // 清理表和索引。
        TableStoreHelper.deleteTableAndIndex(ots, tableName);
        // 创建表。
        TableStoreHelper.createTable(ots, tableName);
        // 定义表schema。
        IndexSchema indexSchema = new IndexSchema();
        indexSchema.setFieldSchemas(Collections.singletonList(
                // 注意：当原来查询的name字段为Keyword类型时，如果修改该字段为Text类型并为该字段设置分词后，查询可能会出现异常。
                // 如果需要同时保留Keyword和Text类型，请参见“虚拟列”功能的示例。假如使用name字段只需要完成匹配*abc*的查询功能，则只用Text类型的字段即可，无需Keyword类型。
                new FieldSchema("name", FieldType.TEXT).setAnalyzer(FieldSchema.Analyzer.Fuzzy)
        // 创建多元索引。
        TableStoreHelper.createIndex(ots, tableName, indexName, indexSchema);
        // 写入一行数据。
        PrimaryKey primaryKey = PrimaryKeyBuilder.createPrimaryKeyBuilder()
                .addPrimaryKeyColumn("pk1", PrimaryKeyValue.fromString("1"))
                .addPrimaryKeyColumn("pk2", PrimaryKeyValue.fromLong(1))
                .addPrimaryKeyColumn("pk3", PrimaryKeyValue.fromBinary(new byte[]{1, 2, 3}))
                .build();
        RowPutChange rowPutChange = new RowPutChange(tableName, primaryKey);
        // 写入属性列。
        rowPutChange.addColumn("name", ColumnValue.fromString("调音师1024x768P.mp4"));
        PutRowRequest request = new PutRowRequest(rowPutChange);
        ots.putRow(request);
        // 等待多元索引中同步完成一条数据。
        TableStoreHelper.waitDataSync(ots, tableName, indexName, 1);
        // 匹配*abc*的查询功能场景展示。
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调音", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调 音", 0);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调音师102", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调音师1024", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调音师1024x", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调音师1024x7", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "调音师1024x768P.mp4", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "24x768P.mp4", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "24x76 8P.mp4", 0);
        assertMatchPhraseQuery(ots, tableName, indexName, "name", "24x7 P.mp4", 0);
    @Test
    // 使用虚拟列。
    public void testFuzzyMatchPhraseWithVirtualField() {
        // 清理表和索引。
        TableStoreHelper.deleteTableAndIndex(ots, tableName);
        // 创建数据表。
        TableStoreHelper.createTable(ots, tableName);
        // 定义表schema。
        IndexSchema indexSchema = new IndexSchema();
        indexSchema.setFieldSchemas(Arrays.asList(
                // 原始字段为Keyword类型，方便进行等值查询。
                new FieldSchema("name", FieldType.KEYWORD).setIndex(true).setStore(true),
                // 创建一个虚拟列“name_virtual_text”，同时设置虚拟列为Text类型且分词类型为Fuzzy。该虚拟列的来源为“name”字段。
                new FieldSchema("name_virtual_text", FieldType.TEXT).setIndex(true).setAnalyzer(FieldSchema.Analyzer.Fuzzy).setVirtualField(true).setSourceFieldName("name")
        // 创建多元索引。
        TableStoreHelper.createIndex(ots, tableName, indexName, indexSchema);
        // 写入一行数据。
        PrimaryKey primaryKey = PrimaryKeyBuilder.createPrimaryKeyBuilder()
                .addPrimaryKeyColumn("pk1", PrimaryKeyValue.fromString("1"))
                .addPrimaryKeyColumn("pk2", PrimaryKeyValue.fromLong(1))
                .addPrimaryKeyColumn("pk3", PrimaryKeyValue.fromBinary(new byte[]{1, 2, 3}))
                .build();
        RowPutChange rowPutChange = new RowPutChange(tableName, primaryKey);
        // 写入属性列。
        rowPutChange.addColumn("name", ColumnValue.fromString("调音师1024x768P.mp4"));
        PutRowRequest request = new PutRowRequest(rowPutChange);
        ots.putRow(request);
        // 等待多元索引中同步完成一条数据。
        TableStoreHelper.waitDataSync(ots, tableName, indexName, 1);
        // 配置*abc*的查询场景展示。
        // 请注意查询字段为虚拟列“name_virtual_text”，而不是“name”。
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调音", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调 音", 0);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调音师102", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调音师1024", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调音师1024x", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调音师1024x7", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "调音师1024x768P.mp4", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "24x768P.mp4", 1);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "24x76 8P.mp4", 0);
        assertMatchPhraseQuery(ots, tableName, indexName, "name_virtual_text", "24x7 P.mp4", 0);
    // 展示MatchPhraseQuery如何实现。
    public static void assertMatchPhraseQuery(SyncClient ots, String tableName, String indexName, String fieldName, String searchContent, long exceptCount) {
        SearchRequest searchRequest = new SearchRequest();
        searchRequest.setTableName(tableName);
        searchRequest.setIndexName(indexName);
        SearchQuery searchQuery = new SearchQuery();
        // 使用MatchPhraseQuery查询分词字段。
        searchQuery.setQuery(QueryBuilders.matchPhrase(fieldName, searchContent).build());
        searchQuery.setLimit(0);
        // 为了展示功能需要，此处设置返回匹配总行数。如果不需要关心匹配总行数，请设置为false，来实现更高性能。
        searchQuery.setGetTotalCount(true);
        searchRequest.setSearchQuery(searchQuery);
        SearchResponse response = ots.search(searchRequest);
        assertEquals(String.format("field:[%s], searchContent:[%s]", fieldName, searchContent), exceptCount, response.getTotalCount());