#ifndef _lucene_search_highlight_highlighter_ #define _lucene_search_highlight_highlighter_ /** * Copyright 2002-2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include "Formatter.h" #include "SimpleHTMLFormatter.h" #include "Fragmenter.h" #include "SimpleFragmenter.h" #include "Scorer.h" #include "TextFragment.h" #undef max using lucene::util::StringBuffer; using lucene::util::PriorityQueue; namespace lucene { namespace search { namespace highlight { /** * Class used to markup highlighted terms found in the best sections of a * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter} * and tokenizers. * @author mark@searcharea.co.uk */ class Highlighter { public: const static int_t DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024; private: int_t maxDocBytesToAnalyze; Formatter * _formatter; Fragmenter * _textFragmenter; Scorer * _fragmentScorer; public: /** * Constructs a Highlighter object with the provided scorer. The scorer object is owned * by the Highlighter object, and it will freed in the destructor. */ Highlighter(Scorer * fragmentScorer); /** * Constructs a Highlighter object with the provided formatter and scorer. The scorer and * the formatter objects are owned by the Highlighter object, and it will free them * in the destructor. */ Highlighter(Formatter * formatter, Scorer * fragmentScorer); Highlighter(Formatter * formatter, Scorer * fragmentScorer, Fragmenter * textFragmenter); /** * Destructor for Highlighter. It deletes the owned scorer, formatter and textFragmenter. */ ~Highlighter(); /** * Highlights chosen terms in a text, extracting the most relevant section. * The document text is analysed in chunks to record hit statistics * across the document. After accumulating stats, the fragment with the highest score * is returned * * @param tokenStream a stream of tokens identified in the text parameter, including offset information. * This is typically produced by an analyzer re-parsing a document's * text. Some work may be done on retrieving TokenStreams more efficently * by adding support for storing original text position data in the Lucene * index but this support is not currently available (as of Lucene 1.4 rc2). * @param text text to highlight terms in * * @return highlighted text fragment or null if no terms found */ char_t * getBestFragment(TokenStream * tokenStream, const char_t * text); /** * Highlights chosen terms in a text, extracting the most relevant sections. * The document text is analysed in chunks to record hit statistics * across the document. After accumulating stats, the fragments with the highest scores * are returned as an array of strings in order of score (contiguous fragments are merged into * one in their original order to improve readability) * * @param text text to highlight terms in * @param maxNumFragments the maximum number of fragments. * * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) */ StringArray * getBestFragments( TokenStream * tokenStream, const char_t * text, int_t maxNumFragments); private: /** * Low level api to get the most relevant sections of the document * @param tokenStream * @param text * @param maxNumFragments * @return * @throws IOException */ TextFragmentList * getBestDocFragments( TokenStream * tokenStream, const char_t * text, StringBuffer * newText, int_t maxNumFragments); /** Improves readability of a score-sorted list of TextFragments by merging any fragments * that were contiguous in the original text into one larger fragment with the correct order. * This will leave a "null" in the array entry for the lesser scored fragment. * * @param frag An array of document fragments in descending score */ void mergeContiguousFragments(TextFragmentList * frag); public: /** * Highlights terms in the text , extracting the most relevant sections * and concatenating the chosen fragments with a separator (typically "..."). * The document text is analysed in chunks to record hit statistics * across the document. After accumulating stats, the fragments with the highest scores * are returned in order as "separator" delimited strings. * * @param text text to highlight terms in * @param maxNumFragments the maximum number of fragments. * @param separator the separator used to intersperse the document fragments (typically "...") * * @return highlighted text */ char_t * getBestFragments( TokenStream * tokenStream, const char_t * text, int_t maxNumFragments, const char_t * separator); /** * @return the maximum number of bytes to be tokenized per doc */ int_t getMaxDocBytesToAnalyze() { return maxDocBytesToAnalyze; } /** * @param byteCount the maximum number of bytes to be tokenized per doc * (This can improve performance with large documents) */ void setMaxDocBytesToAnalyze(int_t byteCount) { maxDocBytesToAnalyze = byteCount; } /** * @return */ Fragmenter * getTextFragmenter() { return _textFragmenter; } /** * @param fragmenter */ void setTextFragmenter(Fragmenter * fragmenter) { _textFragmenter = fragmenter; } /** * @return Object used to score each text fragment */ Scorer * getFragmentScorer() { return _fragmentScorer; } /** * @param scorer */ void setFragmentScorer(Scorer * scorer) { _fragmentScorer = scorer; } }; }}} #endif