Simple Lucene Indexing practice

This is a very simple Lucene indexing exercise based on the tutorial written by Thomas Paul: http://www.javaranch.com/journal/2004/04/Lucene.html

 

import java.io.StringWriter;

import java.util.Date;

 

import org.apache.lucene.analysis.Analyzer;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

 

public class ArticleIndexer {

            public static final String INDEX_DIRECTORY = "lucene-index";

 

            private Document createDocument(String article, String author,

                                    String title, String topic, String url, Date dateWritten) {

 

                        // The Document class represents a document in Lucene. We index Document

                        // objects and get Document objects back when we do a search

                        Document document = new Document();

                        document.add(Field.Text("author", author));

                        document.add(Field.Text("title", title));

                        document.add(Field.Text("topic", topic));

                        // The data is stored but not indexed or tokenized. This is used with

                        // data that you want returned with the results of a search but you

                        // won’t actually be searching on this data. In our example, since we

                        // won’t allow searching for the URL there is no reason to index it but

                        // we want it returned to us when a search result is found.

                        document.add(Field.UnIndexed("url", url));

                        // The data is stored and indexed but not tokenized. This is most useful

                        // for data that should be stored unchanged such as a date. In fact, the

                        // Field.Keyword can take a Date object as input

                        document.add(Field.Keyword("date", dateWritten.toString()));

                        // The data is not stored but it is indexed and tokenized. Large amounts

                        // of data such as the text of the article should be placed in the index

                        // unstored.

                        document.add(Field.UnStored("article", article));

                        return document;

            }

 

            private void indexDocument(Document document) throws Exception {

                        // The Analyzer class is an abstract class that used to provide an

                        // interface that will take a Document and turn it into tokens that can

                        // be indexed. There are several useful implementations of this class

                        // but the most commonly used is the StandardAnalyzer class.

                        Analyzer analyzer = new StandardAnalyzer();

 

                        // The IndexWriter class is used to create and maintain indexes, thread

                        // safe. false is to set to append to the existing index only,

                        // but if the index is not existing, if will cause Exception

                        // of Index locked for write

                        IndexWriter writter = new IndexWriter(INDEX_DIRECTORY, analyzer, true);

                        writter.addDocument(document);

                        // To optimize an index, one has to call optimize() on an IndexWriter

                        // instance. When this happens, all in-memory documents are flushed to

                        // the disk and all index segments are merged into a single segment,

                        // reducing the number of files that make up the index. However,

                        // optimizing an index does not help improve indexing performance. As a

                        // matter of fact, optimizing an index during the indexing process will

                        // only slow things down. Despite this, optimizing may sometimes be

                        // necessary in order to keep the number of open files under control.

                        writter.optimize();

                        writter.close();

            }

 

            public void indexArticle(String article, String author, String title,

                                    String topic, String url, Date dateWritten) throws Exception {

                        Document document = createDocument(article, author, title, topic, url,

                                                dateWritten);

                        indexDocument(document);

            }

 

            public String searchDocument(String indexDirectory, String field,

                                    String criteria) throws Exception {

                        String result = "";

                        // The IndexSearcher class is used to search through an index

                        IndexSearcher is = new IndexSearcher(indexDirectory);

                        Analyzer analyzer = new StandardAnalyzer();

                        // The QueryParser class is used to build a parser that can search

                        // through an index.

                        QueryParser parser = new QueryParser(field, analyzer);

                        Query query = parser.parse(criteria);

                        // The Hits class contains the Document objects that are returned by

                        // running the Query object against the index

                        Hits hits = is.search(query);

 

                        StringWriter sw = new StringWriter();

                        if (hits.length() > 0) {

                                    for (int i = 0; i < hits.length(); i++) {

                                                Document doc = hits.doc(i);

                                                String author = doc.get("author");

                                                sw.append(author);

                                                sw.append(", ");

                                    }

                                    is.close();

                                    result = sw.toString();

                        } else {

                                    result = "No item found!";

                        }

 

                        return result;

            }

 

            public static void main(String args[]) {

                        ArticleIndexer indexer = new ArticleIndexer();

 

                        try {

                                    String article = "First time write lucene is just like….";

 

                                    indexer

                                                .indexArticle(

                                                            article,

                                                            "Hongliang Li",

                                                            "First Lucene",

                                                            "No relevant to football",

                                                            "file:///C:/Documents and Settings/hongliang/My Documents/study/Lucene Tutorial.doc",

                                                            new Date());

 

                                    String criteria = "football";

                                    String searchResult = indexer.searchDocument(INDEX_DIRECTORY,

                                                            "topic", criteria);

                                    System.out.println(searchResult);

                        } catch (Exception e) {

                                    System.out.println(e);

                        }

            }

}

Other references

http://www.onjava.com/pub/a/onjava/2003/03/05/lucene.html?page=2

http://www.darksleep.com/lucene/

 

Advertisements
This entry was posted in Java Utilities. Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s