Welcome 微信登录

首页 / 网页编程 / PHP / 用PHP调用Lucene包来实现全文检索

用PHP调用Lucene包来实现全文检索

2011-09-19

由于工作需要,需要使用PHP实现对网站内大量数据进行全文检索,而且目前最流行的全文检索的搜索引擎库就是Lucene了,它是Apache Jakarta的一个子项目,并且提供了简单实用的API,用这些API,就可以对任何基于文本的数据(包括数据库)进行全文检索。

因为PHP本身就支持调用外部Java类,所以先用Java写了一个类,这个类通过调用Lucene的API,实现了两个方法:

* public String createIndex(String indexDir_path,String dataDir_path)
* public String searchword(String ss,String index_path)

其中createIndex是创建索引方法,传入了两个参数分别是indexDir_path(索引文件的目录)和dataDir_path(被索引的文件目录),返回被索引的文件列表字符串;另一个是searchword,通过传入的关键字参数(ss)对索引进行检索,index_path就是索引文件的目录,返回所有检索到的文件。

这里是源代码,很简单,大家可以参考一下:TxtFileIndexer.java

package TestLucene;

import java.io.File;
import java.io.FileReader;
import java.io.Reader;
import java.net.URLEncoder;
import java.util.Date;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;

public class TxtFileIndexer ...{

public String test() ...{
return "test is ok hohoho";
}

/**//**
* @param args
*/
public String createIndex(String indexDir_path,String dataDir_path) throws Exception ...{
String result = "";
File indexDir = new File(indexDir_path);
File dataDir = new File (dataDir_path);
Analyzer luceneAnalyzer = new StandardAnalyzer();
File[] dataFiles = dataDir.listFiles();
IndexWriter indexWriter = new IndexWriter(indexDir,luceneAnalyzer,true);
long startTime = new Date().getTime ();
for(int i=0; i < dataFiles.length; i++) ...{
if(dataFiles [i].isFile() && dataFiles[i].getName().endsWith(".html")) ...{
result += "Indexing file" + dataFiles[i].getCanonicalPath()+"<br />";
Document document = new Document();
Reader txtReader = new FileReader(dataFiles[i]);
document.add(Field.Text("path",dataFiles [i].getCanonicalPath()));
document.add(Field.Text ("contents",txtReader));
indexWriter.addDocument(document);
}
}

indexWriter.optimize();
indexWriter.close();
long endTime = new Date().getTime();

result += "It takes"+(endTime- startTime)
+ " milliseconds to create index for the files in directory "
+ dataDir.getPath();
return result;
}

public String searchword(String ss,String index_path) throws Exception ...{
String queryStr = ss;
String result = "Result:<br />";
//This is the directory that hosts the Lucene index
File indexDir = new File (index_path);
FSDirectory directory = FSDirectory.getDirectory (indexDir,false);
IndexSearcher searcher = new IndexSearcher(directory);
if(!indexDir.exists())...{
result = "The Lucene index is not exist";
return result;
}
Term term = new Term ("contents",queryStr.toLowerCase());
TermQuery luceneQuery = new TermQuery (term);
Hits hits = searcher.search(luceneQuery);
for(int i = 0; i < hits.length(); i++)...{
Document document = hits.doc(i);
result += "<br /><a href="getfile.php?w="+ss+"&f="+document.get("path") +"">File: " + document.get("path")+"</a>n";
}
return result;
}

}