首页 > 解决方案 > 为什么使用非拉丁文本的 Lucene 模糊搜索更容易导致 TooComplexToDeterminizeException?

问题描述

我有以下执行模糊查询的测试用例。两种情况都插入了一些“Lorem Ipsum”文本,但一种是拉丁字母,另一种使用西里尔字母。

但是,西里尔字母(Cyrillic)的测试用例会引发如下异常:

org.apache.lucene.util.automaton.TooComplexToDeterminizeException: Determinizing automaton with 34479 states and 58454 transitions would result in more than 10000 states.

而拉丁字母的测试用例则不会引发异常,尽管它包含更多的字符和单词。

如果我将西里尔字母用例的最大编辑距离减小到 1,它可以正常工作。但我想知道,为什么这个问题会出现在一种字母表上,而另一种却不会。我还注意到其他字母表(如梵文)也有类似的行为。

无论输入语言如何,我可以做些什么来避免这些问题,同时仍保持最大编辑距离为 2?

该测试使用 JUnit 4.12 和 Lucene Core + Common Analyzers 8.3.0 编写。

import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.junit.Test;

/**
 * Reproduction case for a {@link FuzzyQuery} that succeeds on Latin-script text but fails on
 * Cyrillic text with the same maximum edit distance.
 *
 * <p>Each test indexes a single document whose {@code NAME} field holds a "Lorem Ipsum" passage
 * and then runs a fuzzy query for that very same passage, expecting exactly one hit.
 */
public class LuceneTest
{
  /** Name of the stored field carrying the document id. */
  private static final String ID_FIELD = "ID";
  /** Name of the indexed (not stored) field carrying the searchable text. */
  private static final String NAME_FIELD = "NAME";

  /**
   * Indexes and fuzzy-searches a Latin-script passage with edit distance 2.
   *
   * @throws IOException if the index cannot be written or read
   */
  @Test
  public void testLatin() throws IOException
  {
    Analyzer analyzer = new StandardAnalyzer();

    String latin = "Lorem ipsum dolor sit amet, has ex bonorum scripserit,"
        + " graeco volutpat aliquando ut eum, nec partem evertitur maiestatis ea."
        + " Ad vim aliquam dignissim. Eu audire delectus eum, vel eu viderer democritum voluptatum."
        + " Eu ludus utinam fabulas vim, mei te accumsan conceptam, at sea possit aperiam tacimates."
        + " Sed periculis repudiare ut."
        + " Lorem ipsum dolor sit amet, has ex bonorum scripserit,"
        + " graeco volutpat aliquando ut eum, nec partem evertitur maiestatis ea."
        + " Ad vim aliquam dignissim. Eu audire delectus eum, vel eu viderer democritum voluptatum."
        + " Eu ludus utinam fabulas vim, mei te accumsan conceptam, at sea possit aperiam tacimates."
        + " Sed periculis repudiare ut.";

    // This completes successfully.
    runLuceneTest( latin, latin, analyzer, 2 );
  }

  /**
   * Indexes and fuzzy-searches a (shorter) Cyrillic passage with edit distance 2.
   *
   * @throws IOException if the index cannot be written or read
   */
  @Test
  public void testCyrillic() throws IOException
  {
    Analyzer analyzer = new RussianAnalyzer();

    String cyrillic = "Лорем ипсум долор сит амет, елитр ностер дисцере цу нам, ет вих мунере лаборес феугаит."
        + " Фалли делицатиссими усу еа, еа ферри солеат диссентиет ест, новум лаудем усу еу."
        + " Цу при малис путент мнесарчум.";

    // This throws an exception with edit distance 2.
    runLuceneTest( cyrillic, cyrillic, analyzer, 2 );
  }

  /**
   * Builds a one-document in-memory index, runs the fuzzy search and checks that the single
   * document (id 42) is found.
   *
   * @param insertText      text stored in the {@code NAME} field of the indexed document
   * @param searchText      text used as the fuzzy query term
   * @param analyzer        analyzer handed to the {@link IndexWriterConfig}
   * @param maxEditDistance maximum edit distance passed to the {@link FuzzyQuery}
   * @throws IOException if the index cannot be written or read
   */
  private void runLuceneTest( String insertText, String searchText, Analyzer analyzer, int maxEditDistance )
      throws IOException
  {
    // The directory is a resource too: close it even when the search below throws.
    try ( Directory index = new RAMDirectory() )
    {
      IndexWriterConfig config = new IndexWriterConfig( analyzer );

      try ( IndexWriter w = new IndexWriter( index, config ) )
      {
        w.addDocument( createDoc( 42, insertText ) );
        w.commit();
      }

      List<Document> result = search( index, searchText, maxEditDistance );
      assertEquals( "Number of results", 1, result.size() );
      assertEquals( "Document ID", 42, Integer.parseInt( result.get( 0 ).getField( ID_FIELD ).stringValue() ) );
    }
  }

  /**
   * Creates the document to index: the id as a stored {@link TextField}, the text as an
   * unstored {@link StringField}.
   *
   * <p>NOTE(review): a {@code StringField} is indexed without tokenization, so the complete
   * passage becomes one single term. The field types look swapped relative to the usual
   * "id = StringField, text = TextField" arrangement - confirm whether that is intended.
   *
   * @param id   document id
   * @param name text to make searchable
   * @return the populated document
   */
  private Document createDoc( long id, String name )
  {
    Document doc = new Document();
    doc.add( new TextField( ID_FIELD, Long.toString( id ), Store.YES ) );
    doc.add( new StringField( NAME_FIELD, name, Store.NO ) );
    return doc;
  }

  /**
   * Runs a fuzzy query for {@code name} against the {@code NAME} field and loads the hits.
   *
   * @param index           directory containing the index to search
   * @param name            the complete text used as one fuzzy term
   * @param maxEditDistance maximum edit distance passed to the {@link FuzzyQuery}
   * @return the stored documents of the top hits (at most 50), in result order
   * @throws IOException if the index cannot be read
   */
  private List<Document> search( Directory index, String name, int maxEditDistance ) throws IOException
  {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    // The whole input is wrapped in a single Term; it is not split into words here.
    builder.add( new FuzzyQuery( new Term( NAME_FIELD, name ), maxEditDistance ), Occur.SHOULD );
    BooleanQuery query = builder.build();

    int maxHits = 50;
    try ( IndexReader reader = DirectoryReader.open( index ) )
    {
      IndexSearcher searcher = new IndexSearcher( reader );
      TopDocs docs = searcher.search( query, maxHits );
      ScoreDoc[] hits = docs.scoreDocs;
      List<Document> result = new ArrayList<>( hits.length );
      for ( ScoreDoc hit : hits )
      {
        result.add( searcher.doc( hit.doc ) );
      }
      return result;
    }
  }

}

标签: java, lucene, internationalization

解决方案


推荐阅读