当前位置:  开发笔记 > 编程语言 > 正文

如何在Lucene中查询自动完成/建议?

如何解决《如何在Lucene中查询自动完成/建议?》经验,为你挑选了3个好方法。

我正在寻找一种在Lucene中实现查询自动完成/建议的方法.我用Google搜索并动手尝试了一番,但我见过的所有例子似乎都是在Solr中配置过滤器.我们没有使用Solr,近期也不打算迁移到Solr,而Solr显然只是对Lucene的一层封装,所以我想一定有办法直接用Lucene做到这一点!

我已经研究过EdgeNGramFilter,也明白需要在被索引的字段上运行该过滤器以得到词元(token),然后将它们与输入的查询进行比较......我只是不知道如何把这两部分衔接起来,所以非常感谢任何帮助!

要清楚我正在寻找什么(我意识到我不是太清楚,对不起) - 我正在寻找一个解决方案,当搜索一个术语时,它会返回一个建议查询列表.当在搜索字段中键入"inter"时,它将返回一个建议查询列表,例如"internet","international"等.



1> Mat Mannion..:

根据@Alexandre Victoor的回答,我在contrib包中使用Lucene Spellchecker编写了一个小类(并使用其中包含的LuceneDictionary),它完全符合我的要求.

这允许使用单个字段从单个源索引重新索引,并提供术语建议.结果按原始索引中与该术语匹配的文档数进行排序,因此首先显示更受欢迎的术语.似乎工作得很好:)

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.ISOLatin1AccentFilter;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Search term auto-completer, works for single terms (so use on the last term
 * of the query).
 * 

* Returns more popular terms first. * * @author Mat Mannion, M.Mannion@warwick.ac.uk */ public final class Autocompleter { private static final String GRAMMED_WORDS_FIELD = "words"; private static final String SOURCE_WORD_FIELD = "sourceWord"; private static final String COUNT_FIELD = "count"; private static final String[] ENGLISH_STOP_WORDS = { "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "i", "if", "in", "into", "is", "no", "not", "of", "on", "or", "s", "such", "t", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with" }; private final Directory autoCompleteDirectory; private IndexReader autoCompleteReader; private IndexSearcher autoCompleteSearcher; public Autocompleter(String autoCompleteDir) throws IOException { this.autoCompleteDirectory = FSDirectory.getDirectory(autoCompleteDir, null); reOpenReader(); } public List suggestTermsFor(String term) throws IOException { // get the top 5 terms for query Query query = new TermQuery(new Term(GRAMMED_WORDS_FIELD, term)); Sort sort = new Sort(COUNT_FIELD, true); TopDocs docs = autoCompleteSearcher.search(query, null, 5, sort); List suggestions = new ArrayList(); for (ScoreDoc doc : docs.scoreDocs) { suggestions.add(autoCompleteReader.document(doc.doc).get( SOURCE_WORD_FIELD)); } return suggestions; } @SuppressWarnings("unchecked") public void reIndex(Directory sourceDirectory, String fieldToAutocomplete) throws CorruptIndexException, IOException { // build a dictionary (from the spell package) IndexReader sourceReader = IndexReader.open(sourceDirectory); LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete); // code from // org.apache.lucene.search.spell.SpellChecker.indexDictionary( // Dictionary) IndexReader.unlock(autoCompleteDirectory); // use a custom analyzer so we can do EdgeNGramFiltering IndexWriter writer = new IndexWriter(autoCompleteDirectory, new Analyzer() { public TokenStream tokenStream(String fieldName, Reader reader) 
{ TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); result = new ISOLatin1AccentFilter(result); result = new StopFilter(result, ENGLISH_STOP_WORDS); result = new EdgeNGramTokenFilter( result, Side.FRONT,1, 20); return result; } }, true); writer.setMergeFactor(300); writer.setMaxBufferedDocs(150); // go through every word, storing the original word (incl. n-grams) // and the number of times it occurs Map wordsMap = new HashMap(); Iterator iter = (Iterator) dict.getWordsIterator(); while (iter.hasNext()) { String word = iter.next(); int len = word.length(); if (len < 3) { continue; // too short we bail but "too long" is fine... } if (wordsMap.containsKey(word)) { throw new IllegalStateException( "This should never happen in Lucene 2.3.2"); // wordsMap.put(word, wordsMap.get(word) + 1); } else { // use the number of documents this word appears in wordsMap.put(word, sourceReader.docFreq(new Term( fieldToAutocomplete, word))); } } for (String word : wordsMap.keySet()) { // ok index the word Document doc = new Document(); doc.add(new Field(SOURCE_WORD_FIELD, word, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term doc.add(new Field(GRAMMED_WORDS_FIELD, word, Field.Store.YES, Field.Index.TOKENIZED)); // grammed doc.add(new Field(COUNT_FIELD, Integer.toString(wordsMap.get(word)), Field.Store.NO, Field.Index.UN_TOKENIZED)); // count writer.addDocument(doc); } sourceReader.close(); // close writer writer.optimize(); writer.close(); // re-open our reader reOpenReader(); } private void reOpenReader() throws CorruptIndexException, IOException { if (autoCompleteReader == null) { autoCompleteReader = IndexReader.open(autoCompleteDirectory); } else { autoCompleteReader.reopen(); } autoCompleteSearcher = new IndexSearcher(autoCompleteReader); } public static void main(String[] args) throws Exception { Autocompleter autocomplete = new Autocompleter("/index/autocomplete"); // run this to re-index from 
the current index, shouldn't need to do // this very often // autocomplete.reIndex(FSDirectory.getDirectory("/index/live", null), // "content"); String term = "steve"; System.out.println(autocomplete.suggestTermsFor(term)); // prints [steve, steven, stevens, stevenson, stevenage] } }


请注意,这是为较旧版本的Lucene创建的.在当前版本(4.4.0)中,在Analyzer类上实现的抽象方法是createComponents(String fieldName,Reader reader).见http://lucene.apache.org/core/4_4_0/core/org/apache/lucene/analysis/Analyzer.html

2> ThisIsTheDav..:

这是把Mat的实现移植到Lucene.NET(C#)的版本,以及使用jQuery的自动完成功能连接文本框的代码片段.


... JQuery自动完成:

// Don't navigate away from the field when pressing Tab on a selected item.
$( "#search-input" ).keydown(function (event) {
    // Newer jQuery UI releases store the widget instance under the namespaced
    // key "ui-autocomplete"; older ones use "autocomplete". Try both so the
    // handler does not throw a TypeError when one of the keys is missing.
    var widget = $(this).data("ui-autocomplete") || $(this).data("autocomplete");
    if (event.keyCode === $.ui.keyCode.TAB && widget && widget.menu.active) {
        event.preventDefault();
    }
});

// Attach the suggestion drop-down; only the last whitespace-separated word of
// the field is replaced when a suggestion is picked.
$( "#search-input" ).autocomplete({
    source: '@Url.Action("SuggestTerms")', // <-- ASP.NET MVC Razor syntax
    minLength: 2,
    delay: 500,
    focus: function () {
        // prevent value inserted on focus
        return false;
    },
    select: function (event, ui) {
        var terms = this.value.split(/\s+/);
        terms.pop(); // remove the partially typed word
        terms.push(ui.item.value.trim()); // add completed item
        this.value = terms.join(" ");
        // returning false stops the widget from overwriting the whole field
        return false;
    }
});

...这是ASP.NET MVC控制器代码:

    //
    // GET: /MyApp/SuggestTerms?term=something
    /// <summary>
    /// Returns auto-complete suggestions for the last whitespace-separated word of
    /// <paramref name="term"/> as a JSON array of strings.
    /// </summary>
    /// <param name="term">The text currently typed into the search box.</param>
    /// <returns>A JSON array of suggested terms; empty when the input is blank.</returns>
    public JsonResult SuggestTerms(string term)
    {
        // This action is called via GET, and MVC rejects JSON GET responses unless
        // AllowGet is passed, so every return path has to specify it.
        if (string.IsNullOrWhiteSpace(term))
            return Json(new string[] {}, JsonRequestBehavior.AllowGet);

        // Only the word currently being typed is completed.
        term = term.Split().Last();

        // Fetch suggestions
        string[] suggestions = SearchSvc.SuggestTermsFor(term).ToArray();

        return Json(suggestions, JsonRequestBehavior.AllowGet);
    }

...这里是C#中Mat的代码:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Lucene.Net.Store;
using Lucene.Net.Index;
using Lucene.Net.Search;
using SpellChecker.Net.Search.Spell;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis.NGram;
using Lucene.Net.Documents;

namespace Cipher.Services
{
    /// <summary>
    /// Search term auto-completer, works for single terms (so use on the last term of the query).
    /// Returns more popular terms first.
    /// </summary>
    /// <remarks>
    /// Author: Mat Mannion, M.Mannion@warwick.ac.uk
    /// </remarks>
    public class SearchAutoComplete
    {
        /// <summary>Maximum number of suggestions returned by <see cref="SuggestTermsFor"/>.</summary>
        public int MaxResults { get; set; }

        /// <summary>
        /// Analyzer that lower-cases, ASCII-folds and stop-filters tokens and then
        /// emits their front edge n-grams (1 to 20 characters).
        /// </summary>
        private class AutoCompleteAnalyzer : Analyzer
        {
            public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
            {
                TokenStream result = new StandardTokenizer(kLuceneVersion, reader);

                result = new StandardFilter(result);
                result = new LowerCaseFilter(result);
                result = new ASCIIFoldingFilter(result);
                result = new StopFilter(false, result, StopFilter.MakeStopSet(kEnglishStopWords));
                result = new EdgeNGramTokenFilter(
                    result, Lucene.Net.Analysis.NGram.EdgeNGramTokenFilter.DEFAULT_SIDE, 1, 20);

                return result;
            }
        }

        private static readonly Lucene.Net.Util.Version kLuceneVersion = Lucene.Net.Util.Version.LUCENE_29;

        private static readonly String kGrammedWordsField = "words";

        private static readonly String kSourceWordField = "sourceWord";

        private static readonly String kCountField = "count";

        private static readonly String[] kEnglishStopWords = {
            "a", "an", "and", "are", "as", "at", "be", "but", "by",
            "for", "i", "if", "in", "into", "is",
            "no", "not", "of", "on", "or", "s", "such",
            "t", "that", "the", "their", "then", "there", "these",
            "they", "this", "to", "was", "will", "with"
        };

        private readonly Directory m_directory;

        private IndexReader m_reader;

        private IndexSearcher m_searcher;

        /// <summary>Opens the auto-complete index stored in the given file-system directory.</summary>
        /// <param name="autoCompleteDir">Path of the directory holding the auto-complete index.</param>
        public SearchAutoComplete(string autoCompleteDir) :
            this(FSDirectory.Open(new System.IO.DirectoryInfo(autoCompleteDir)))
        {
        }

        /// <summary>Opens the auto-complete index stored in the given Lucene directory.</summary>
        /// <param name="autoCompleteDir">Directory holding the auto-complete index.</param>
        /// <param name="maxResults">Maximum number of suggestions to return per query.</param>
        public SearchAutoComplete(Directory autoCompleteDir, int maxResults = 8)
        {
            this.m_directory = autoCompleteDir;
            MaxResults = maxResults;

            ReplaceSearcher();
        }

        /// <summary>
        /// Find terms matching the given partial word that appear in the highest number of documents.
        /// </summary>
        /// <param name="term">A word or part of a word</param>
        /// <returns>A list of suggested completions</returns>
        public IEnumerable<string> SuggestTermsFor(string term)
        {
            // No auto-complete index has been built yet.
            if (m_searcher == null)
                return new string[] { };

            // get the top terms for query
            Query query = new TermQuery(new Term(kGrammedWordsField, term.ToLower()));

            // reverse = true: highest document count first, so the most popular
            // terms lead the list (an ascending sort would return the rarest terms).
            Sort sort = new Sort(new SortField(kCountField, SortField.INT, true));

            TopDocs docs = m_searcher.Search(query, null, MaxResults, sort);
            string[] suggestions = docs.ScoreDocs.Select(doc =>
                m_reader.Document(doc.Doc).Get(kSourceWordField)).ToArray();

            return suggestions;
        }

        /// <summary>
        /// Open the index in the given directory and create a new index of word frequency for the
        /// given index.
        /// </summary>
        /// <param name="sourceDirectory">Directory containing the index to count words in.</param>
        /// <param name="fieldToAutocomplete">The field in the index that should be analyzed.</param>
        public void BuildAutoCompleteIndex(Directory sourceDirectory, String fieldToAutocomplete)
        {
            // build a dictionary (from the spell package)
            using (IndexReader sourceReader = IndexReader.Open(sourceDirectory, true))
            {
                LuceneDictionary dict = new LuceneDictionary(sourceReader, fieldToAutocomplete);

                // code from
                // org.apache.lucene.search.spell.SpellChecker.indexDictionary(
                // Dictionary)
                //IndexWriter.Unlock(m_directory);

                // use a custom analyzer so we can do EdgeNGramFiltering
                var analyzer = new AutoCompleteAnalyzer();
                using (var writer = new IndexWriter(m_directory, analyzer, true, IndexWriter.MaxFieldLength.LIMITED))
                {
                    writer.MergeFactor = 300;
                    writer.SetMaxBufferedDocs(150);

                    // go through every word, storing the original word (incl. n-grams)
                    // and the number of times it occurs
                    foreach (string word in dict)
                    {
                        if (word.Length < 3)
                            continue; // too short we bail but "too long" is fine...

                        // ok index the word
                        // use the number of documents this word appears in
                        int freq = sourceReader.DocFreq(new Term(fieldToAutocomplete, word));
                        var doc = MakeDocument(fieldToAutocomplete, word, freq);

                        writer.AddDocument(doc);
                    }

                    writer.Optimize();
                }
            }

            // re-open our reader
            ReplaceSearcher();
        }

        /// <summary>
        /// Builds the index document for one source term: the original term, its
        /// analyzed (grammed) form and its document frequency.
        /// </summary>
        private static Document MakeDocument(String fieldToAutocomplete, string word, int frequency)
        {
            var doc = new Document();
            doc.Add(new Field(kSourceWordField, word, Field.Store.YES,
                    Field.Index.NOT_ANALYZED)); // orig term
            doc.Add(new Field(kGrammedWordsField, word, Field.Store.YES,
                    Field.Index.ANALYZED)); // grammed
            doc.Add(new Field(kCountField,
                    frequency.ToString(), Field.Store.NO,
                    Field.Index.NOT_ANALYZED)); // count
            return doc;
        }

        /// <summary>
        /// Opens or refreshes the reader on the auto-complete index and builds a new
        /// searcher on it; clears the searcher when no index exists yet.
        /// </summary>
        private void ReplaceSearcher()
        {
            if (IndexReader.IndexExists(m_directory))
            {
                if (m_reader == null)
                {
                    m_reader = IndexReader.Open(m_directory, true);
                }
                else
                {
                    // Reopen() does not refresh in place: it returns a new reader when
                    // the index has changed, so the result must replace the old reader.
                    IndexReader refreshed = m_reader.Reopen();
                    if (refreshed != m_reader)
                    {
                        m_reader.Dispose();
                        m_reader = refreshed;
                    }
                }

                m_searcher = new IndexSearcher(m_reader);
            }
            else
            {
                m_searcher = null;
            }
        }
    }
}



3> user2098849..:

我的代码基于lucene 4.2,可以帮到你

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea4pinyin.analyzer.lucene.IKAnalyzer4PinYin;


/**
 * Demo that builds a spell-check index from one field of an existing Lucene
 * index and prints the suggestions for a sample misspelled query.
 *
 * @version 2013-11-25 11:13:59
 */
public class LuceneSpellCheckerDemoService {

    private static final String INDEX_FILE = "/Users/r/Documents/jar/luke/youtui/index";
    private static final String INDEX_FILE_SPELL = "/Users/r/Documents/jar/luke/spell";

    private static final String INDEX_FIELD = "app_name_quanpin";

    public static void main(String args[]) {

        try {
            // analyzer used when writing the spell-check index
            PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new IKAnalyzer4PinYin(
                    true));

            // writer configuration for the spell-check index
            IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_42, wrapper);
            conf.setOpenMode(OpenMode.CREATE_OR_APPEND);

            // read the source index through an in-memory copy
            RAMDirectory ramDir = loadIndexIntoMemory();
            try {
                DirectoryReader indexReader = DirectoryReader.open(ramDir);
                try {
                    Dictionary dic = new LuceneDictionary(indexReader, INDEX_FIELD);
                    printSuggestions(dic, conf);
                } finally {
                    indexReader.close();
                }
            } finally {
                ramDir.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Copies the on-disk source index into memory and releases the on-disk directory. */
    private static RAMDirectory loadIndexIntoMemory() throws IOException {
        Directory directory = FSDirectory.open(new File(INDEX_FILE));
        try {
            return new RAMDirectory(directory, IOContext.READ);
        } finally {
            // the source directory is only needed while the RAMDirectory is being built
            directory.close();
        }
    }

    /** Indexes the dictionary into the spell-check index and prints up to 10 suggestions. */
    private static void printSuggestions(Dictionary dic, IndexWriterConfig conf) throws IOException {
        Directory spellDirectory = FSDirectory.open(new File(INDEX_FILE_SPELL));
        try {
            SpellChecker sc = new SpellChecker(spellDirectory);
            try {
                // alternative source: a plain-text word list
                //sc.indexDictionary(new PlainTextDictionary(new File("myfile.txt")), conf, false);
                sc.indexDictionary(dic, conf, true);
                String[] strs = sc.suggestSimilar("zhsiwusdazhanjiangshi", 10);
                for (int i = 0; i < strs.length; i++) {
                    System.out.println(strs[i]);
                }
            } finally {
                // close the checker even when indexing or suggesting fails
                sc.close();
            }
        } finally {
            spellDirectory.close();
        }
    }

}

推荐阅读
乐韵答题
这个屌丝很懒,什么也没留下!
DevBox开发工具箱 | 专业的在线开发工具网站    京公网安备 11010802040832号  |  京ICP备19059560号-6
Copyright © 1998 - 2020 DevBox.CN. All Rights Reserved devBox.cn 开发工具箱 版权所有