/*
SDX: Documentary System in XML.
Copyright (C) 2000, 2001, 2002  Ministere de la culture et de la communication (France), AJLSM

Ministere de la culture et de la communication,
Mission de la recherche et de la technologie
3 rue de Valois, 75042 Paris Cedex 01 (France)
mrt@culture.fr, michel.bottin@culture.fr

AJLSM, 17, rue Vital Carles, 33000 Bordeaux (France)
sevigny@ajlsm.com

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the
Free Software Foundation, Inc.
59 Temple Place - Suite 330, Boston, MA  02111-1307, USA
or connect to:
http://www.fsf.org/copyleft/gpl.html
*/
package fr.gouv.culture.sdx.search.lucene.analysis;

import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WordlistLoader;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import fr.gouv.culture.sdx.search.lucene.analysis.filter.BrazilianStemFilter;

/**
 * Analyzer for the Brazilian Portuguese language. Supports an external list of stop words
 * (words that will not be indexed at all) and an external list of exclusions (words that
 * will not be stemmed, but are still indexed).
 *
 * <p>Stop words are matched without regard to case: each token is lower-cased before it is
 * looked up in the stop set, so the stop words supplied to the constructors are expected to
 * be in lower case.</p>
 *
 * @author    Jo&atilde;o Kramer
 * @version   $Id: BrazilianAnalyzer.java,v 1.0 2001/02/13 21:29:04
 * @deprecated use {@link org.apache.lucene.analysis.br.BrazilianAnalyzer}
 */
public class Analyzer_br extends DefaultAnalyzer {

    /** Type identifier returned by {@link #getAnalyzerType()}. */
    protected static final String ANALYZER_TYPE="Analyzer_br";

    /**
     * List of typical Brazilian stop words. All entries are lower case and unaccented.
     * This class does not install the list by itself; callers may pass it to
     * {@link #Analyzer_br(String[])}.
     */
    public static final String[] BRAZILIAN_STOP_WORDS = {
        "a", "ainda", "alem", "ambas", "ambos", "antes",
        "ao", "aonde", "aos", "apos", "aquele", "aqueles",
        "as", "assim", "com", "como", "contra", "contudo",
        "cuja", "cujas", "cujo", "cujos", "da", "das", "de",
        "dela", "dele", "deles", "demais", "depois", "desde",
        "desta", "deste", "dispoe", "dispoem", "diversa",
        "diversas", "diversos", "do", "dos", "durante", "e",
        "ela", "elas", "ele", "eles", "em", "entao", "entre",
        "essa", "essas", "esse", "esses", "esta", "estas",
        "este", "estes", "ha", "isso", "isto", "logo", "mais",
        "mas", "mediante", "menos", "mesma", "mesmas", "mesmo",
        "mesmos", "na", "nao", "nas", "nem", "nesse", "neste",
        "nos", "o", "os", "ou", "outra", "outras", "outro", "outros",
        "pela", "pelas", "pelo", "pelos", "perante", "pois", "por",
        "porque", "portanto", "proprio", "proprios", "quais", "qual",
        "qualquer", "quando", "quanto", "que", "quem", "quer", "se",
        "seja", "sem", "sendo", "seu", "seus", "sob", "sobre", "sua",
        "suas", "tal", "tambem", "teu", "teus", "toda", "todas", "todo",
        "todos", "tua", "tuas", "tudo", "um", "uma", "umas", "uns"};


    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopwords the words to drop from the token stream, in lower case
     */
    public Analyzer_br(String[] stopwords) {
        // Lucene 2.1.0 update: makeStopSet(..) replaces the former makeStopTable(..).
        super.stopTable = StopFilter.makeStopSet(stopwords);
    }

    /**
     * Builds an analyzer with the given stop words.
     * The set is stored as is (no copy is made).
     *
     * @param stopwords the set of words to drop from the token stream, in lower case
     */
    public Analyzer_br(Set stopwords) {
        // Lucene 2.1.0 update: the stop words are held in a Set instead of a Hashtable.
        super.stopTable = stopwords;
    }

    /**
     * Builds an analyzer with the stop words read from the given file.
     *
     * @param stopwords the file holding the stop words, in lower case
     * @throws IOException if the word list cannot be loaded from the file
     */
    public Analyzer_br(File stopwords) throws IOException {
        // Lucene 2.1.0 update: getWordSet(..) replaces the former getWordtable(..).
        super.stopTable = WordlistLoader.getWordSet(stopwords);
    }

    /**
     * Builds the stem exclusion list from an array of Strings.
     *
     * @param exclusionlist the words that must not be stemmed
     */
    public void setStemExclusionTable(String[] exclusionlist) {
        // Lucene 2.1.0 update: makeStopSet(..) replaces the former makeStopTable(..).
        super.excludeTable = StopFilter.makeStopSet(exclusionlist);
    }

    /**
     * Sets the stem exclusion list from a Set.
     * The set is stored as is (no copy is made).
     *
     * @param exclusionlist the set of words that must not be stemmed
     */
    public void setStemExclusionTable(Set exclusionlist) {
        // Lucene 2.1.0 update: the exclusions are held in a Set instead of a Hashtable.
        super.excludeTable = exclusionlist;
    }

    /**
     * Builds the stem exclusion list from the words contained in the given file.
     *
     * @param exclusionlist the file holding the words that must not be stemmed
     * @throws IOException if the word list cannot be loaded from the file
     */
    public void setStemExclusionTable(File exclusionlist) throws IOException {
        // Lucene 2.1.0 update: getWordSet(..) replaces the former getWordtable(..).
        super.excludeTable = WordlistLoader.getWordSet(exclusionlist);
    }

    /**
     * Creates a TokenStream which tokenizes all the text in the provided Reader.
     *
     * @param fieldName the name of the field being analyzed; not used by this implementation
     * @param reader    the source of the text to tokenize
     * @return  A TokenStream built from a StandardTokenizer filtered with
     *          StandardFilter, StopFilter, BrazilianStemFilter and LowerCaseFilter,
     *          in that order.
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        // Tokens still carry their original case at this point, so the stop words are
        // matched ignoring case; otherwise capitalised stop words would slip through.
        result = new StopFilter(result, super.stopTable, true);
        result = new BrazilianStemFilter(result, super.excludeTable);
        // Lower-casing is applied last, on the output of the stem filter.
        result = new LowerCaseFilter(result);
        return result;
    }


    /**
     * Returns the type identifier of this analyzer.
     *
     * @return {@link #ANALYZER_TYPE}
     * @see fr.gouv.culture.sdx.search.lucene.analysis.AbstractAnalyzer
     */
    protected String getAnalyzerType() {
        return Analyzer_br.ANALYZER_TYPE;
    }

}
