
Using Lucene for Text Indexing and Search

Date: 2017/3/1 9:42:06   Editor: Linux編程

Recently my company needed to fuzzy-match similar keywords, so I went straight to Lucene.

Lucene lets you add search to your application: it can index and search any data you can extract as text, and it does not care where the data comes from or even what language it is in, as long as you convert it to plain text first. In other words, you can search HTML pages, text files, Word documents, PDFs, or anything else from which text can be extracted. You can also index data stored in a database to give your users features such as full-text search. Lucene is a powerful library, and there are many open-source analyzer plugins for different languages.

Below is an example in which I index a .txt file line by line, treating every line as its own Document. In Lucene, a Document is roughly the counterpart of a record (row) in a database table, and each Field is like a column; the analyzer strips stop words from the text automatically, and the fields you designate are indexed as terms for later querying. Lucene is easy to use and very fast, though not as flexible as a database. I used Lucene 4.7.2 here, the latest version at the time; note that you need to add the required jars to your project. With Maven you can pull in everything you need from http://mvnrepository.com/artifact/org.apache.lucene (a sketch of the dependencies follows). The example below should be enough to learn the basic usage.
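Roughly, the Maven dependencies would look like the sketch below. This is my own illustration, assuming version 4.7.2 and the modules used in the code that follows (the core library, the SmartChineseAnalyzer module, and the query parser); adjust the list to whatever your project actually needs.

<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>4.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers-smartcn</artifactId>
    <version>4.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-queryparser</artifactId>
    <version>4.7.2</version>
</dependency>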

-------------------------------------- Divider --------------------------------------

Indexing and searching across multiple Lucene indexes  http://www.linuxidc.com/Linux/2012-05/59757.htm

Lucene in Action (2nd Edition), Chinese edition companion source code  http://www.linuxidc.com/Linux/2013-10/91055.htm

Lucene in Action (2nd Edition), high-quality Chinese PDF  http://www.linuxidc.com/Linux/2013-10/91052.htm

Full-text search with integrated geolocation using Lucene-Spatial  http://www.linuxidc.com/Linux/2012-02/53117.htm

Lucene + Hadoop distributed search framework Nut 1.0a9  http://www.linuxidc.com/Linux/2012-02/53113.htm

Lucene + Hadoop distributed search framework Nut 1.0a8  http://www.linuxidc.com/Linux/2012-02/53111.htm

Lucene + Hadoop distributed search framework Nut 1.0a7  http://www.linuxidc.com/Linux/2012-02/53110.htm

Project 2-1: Configuring Lucene and building a web search system [Ubuntu 10.10]  http://www.linuxidc.com/Linux/2010-11/30103.htm

-------------------------------------- Divider --------------------------------------

package lucene.home.clq;

/**
* @author chenlongquan
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// Build the index
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * This class builds the index from the text files.
 */
public class Indexer {

    public static void main(String[] args) throws Exception {

        String indexDir = "f:\\index";   // directory where the index will be created
        String dataDir = "f:\\baidu";    // directory containing the .txt files to index

        long start = System.currentTimeMillis();
        Indexer indexer = new Indexer(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            indexer.close();
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    private IndexWriter writer;

    public Indexer(String indexDir) throws IOException {
        Directory dir = FSDirectory.open(new File(indexDir));
        writer = new IndexWriter(dir, indexWriterConfig());  // the writer that builds the index
    }

    public void close() throws IOException {
        writer.close();  // commit and release the index
    }

    private IndexWriterConfig indexWriterConfig() {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        return config;
    }
    public int index(String dataDir, FileFilter filter) throws Exception {

        File[] files = new File(dataDir).listFiles();

        for (File f : files) {
            if (!f.isDirectory() &&
                !f.isHidden() &&
                f.exists() &&
                f.canRead() &&
                (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }

        return writer.numDocs();  // number of documents in the index
    }

    private static class TextFilesFilter implements FileFilter {
        public boolean accept(File path) {
            return path.getName().toLowerCase().endsWith(".txt");  // index only .txt files
        }
    }

    /**
     * For each file, read every line and treat each line as its own Document.
     * @param f the file to index
     * @throws Exception
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        List<String> lists = readFileNoDup(f);
        for (String list : lists) {
            Document doc = new Document();
            doc.add(new Field("contents", list, TextField.TYPE_STORED));
            writer.addDocument(doc);
        }
    }
    // Read a file line by line
    private List<String> readFile(File filePathAndName) throws IOException {

        FileInputStream fis = new FileInputStream(filePathAndName);
        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
        BufferedReader br = new BufferedReader(isr);
        LineNumberReader lnr = new LineNumberReader(br);

        List<String> returnValue = new ArrayList<String>();
        while (true) {
            String tempStr = lnr.readLine();
            if (tempStr == null)
                break;
            if (tempStr.length() < 2)   // skip very short lines
                continue;
            returnValue.add(tempStr);
        }
        lnr.close();
        br.close();
        isr.close();
        fis.close();
        return returnValue;
    }
    // Read a file and return its lines with duplicates removed
    public static List<String> readFileNoDup(File filePathAndName)
            throws IOException {

        FileInputStream fis = new FileInputStream(filePathAndName);
        InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
        BufferedReader br = new BufferedReader(isr);
        LineNumberReader lnr = new LineNumberReader(br);

        Set<String> set = new HashSet<String>();
        while (true) {
            String tempStr = lnr.readLine();
            if (tempStr == null)
                break;
            if (tempStr.length() < 2)   // skip very short lines
                continue;
            set.add(tempStr.trim());    // the Set removes duplicate lines
        }
        lnr.close();
        br.close();
        isr.close();
        fis.close();
        List<String> returnValue = new ArrayList<String>(set.size());
        returnValue.addAll(set);
        return returnValue;
    }
}

// Now search the index we just built

package lucene.home.clq;



/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;


import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;


// From chapter 1


/**
 * This class searches the index built above.
 */
public class Searcher {

    public static void main(String[] args) throws IllegalArgumentException,
            IOException, ParseException {

        final String indexDir = "e:\\soso\\soso";
        String q = " ";  // put the terms you indexed here to run a fuzzy search
        List<String> docs = query(indexDir, q);
        System.out.println(docs);
    }

    public static void search(String indexDir, String q)
            throws IOException, ParseException {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
        // Directory dir = FSDirectory.open(new File(indexDir));
        IndexSearcher is = new IndexSearcher(reader);

        QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
                new SmartChineseAnalyzer(Version.LUCENE_47));
        Query query = parser.parse(q);           // parse the query string
        long start = System.currentTimeMillis();
        TopDocs hits = is.search(query, 500);    // fetch the top 500 hits

        // ScoreDoc[] hits = is.search(query, null, 10).scoreDocs;
        long end = System.currentTimeMillis();

        System.err.println("Found " + hits.totalHits +
                " document(s) (in " + (end - start) +
                " milliseconds) that matched query '" +
                q + "':");

        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);  // fetch the stored document
            System.out.println(doc.get("contents"));
        }
        reader.close();
    }

    private static List<String> query(String indexDir, String searcher)
            throws IOException, ParseException {
        if (searcher == null || searcher.length() == 0) {
            return null;
        }

        searcher = searcher.trim();
        if (searcher.length() == 0) {
            return null;
        }

        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));  // open the index
        // IndexReader reader = DirectoryReader.open(SimpleFSDirectory.open(new File(indexDir)));
        IndexSearcher is = new IndexSearcher(reader);  // searches the index
        QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
                new SmartChineseAnalyzer(Version.LUCENE_47));  // parses queries against the "contents" field
        Query query = parser.parse(searcher);
        TopFieldDocs hits = is.search(query, 100,
                new Sort(new SortField("contents", SortField.Type.SCORE, false)));  // top 100 hits, sorted by score
        // TopDocs hits1 = is.search(query, 200);  // alternative: top 200 hits in default score order
        List<String> list = new ArrayList<String>();
        for (ScoreDoc scoreDoc : hits.scoreDocs) {
            Document doc = is.doc(scoreDoc.doc);
            list.add(doc.get("contents"));
        }
        reader.close();
        return list;
    }
}

Here I only indexed the text contents of the documents. You can also add other attributes as Fields, for example the file path; check the Lucene API for the available field types and methods (a small sketch follows below). This is only a rough walkthrough, so feel free to discuss any questions.
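As an illustration of that idea (my own sketch, not part of the original code): a variant of indexFile() that also stores each line's source file path in a StringField, which keeps the path as a single unanalyzed term that can be returned with results or used for exact matching. It assumes an extra import of org.apache.lucene.document.StringField.

    // Hypothetical variant of indexFile(): also store the source file path with each line.
    private void indexFileWithPath(File f) throws Exception {
        for (String line : readFileNoDup(f)) {
            Document doc = new Document();
            doc.add(new Field("contents", line, TextField.TYPE_STORED));             // analyzed, stored text
            doc.add(new StringField("path", f.getCanonicalPath(), Field.Store.YES)); // unanalyzed, stored path
            writer.addDocument(doc);
        }
    }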

