package src.java.org.apache.lucene.analysis.snowball;
/* ====================================================================
* The Apache Software License, Version 1.1
*
* Copyright (c) 2001 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.de.WordlistLoader;
import java.io.Reader;
import java.util.Hashtable;
import java.io.File;
/**
 * Analyzer for Spanish using the SNOWBALL stemmer. Filters {@link StandardTokenizer}
 * with {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter} and
 * {@link SpanishStemFilter}. Supports an external list of stopwords
 * (words that will not be indexed at all) and an external list of exclusions (words that will
 * not be stemmed, but indexed) as in the German package from Gerhard Schwarz.
 * A default set of stopwords is used unless an alternative list is specified; the
 * exclusion list is empty by default.
 *
 * @author Alex Murzaku (alex at lissus.com)
 */
public class SpanishAnalyzer extends Analyzer {
    /**
     * An array containing some common Spanish words that are usually not
     * useful for searching. Imported from http://www.unine.ch/info/clef/.
     *
     * <p>Accented letters are written as Unicode escapes so that the list does
     * not depend on the encoding used to read or compile this file. An earlier
     * revision had lost every non-ASCII character, which turned entries such as
     * the Spanish for "more", "what" and "he" into meaningless fragments.
     * Escapes used: U+00E1 (a acute), U+00E9 (e acute), U+00ED (i acute),
     * U+00F3 (o acute), U+00FA (u acute).
     *
     * <p>NOTE(review): the accented forms were reconstructed from the damaged
     * entries and the sort order of the list; the entries marked "reconstructed"
     * below are the least certain and should be checked against the upstream list.
     */
    public static final String[] SPANISH_STOP_WORDS = {
        "a",
        "ac\u00e1",
        "ah\u00ed", // reconstructed from "ah"
        "ajena",
        "ajenas",
        "ajeno",
        "ajenos",
        "al",
        "algo",
        "alguna",
        "algunas",
        "alguno",
        "algunos",
        "alg\u00fan",
        "all\u00e1",
        "all\u00ed",
        "aquel",
        "aquella",
        "aquellas",
        "aquello",
        "aquellos",
        "aqu\u00ed",
        "cada",
        "cierta",
        "ciertas",
        "cierto",
        "ciertos",
        "como",
        "con",
        "conmigo",
        "consigo",
        "contigo",
        "cualquier",
        "cualquiera",
        "cualquieras",
        "cuan",
        "cuanta",
        "cuantas",
        "cuanto",
        "cuantos",
        "cu\u00e1n",
        "cu\u00e1nta",
        "cu\u00e1ntas",
        "cu\u00e1nto",
        "cu\u00e1ntos",
        "c\u00f3mo",
        "de",
        "dejar",
        "del",
        "demasiada",
        "demasiadas",
        "demasiado",
        "demasiados",
        "dem\u00e1s",
        "el",
        "ella",
        "ellas",
        "ellos",
        "esa",
        "esas",
        "ese",
        "esos",
        "esta",
        "estar",
        "estas",
        "este",
        "estos",
        "hacer",
        "hasta",
        "jam\u00e1s",
        "junto",
        "juntos",
        "la",
        "las",
        "lo",
        "los",
        "mas",
        "me",
        "menos",
        "mientras",
        "misma",
        "mismas",
        "mismo",
        "mismos",
        "mucha",
        "muchas",
        "mucho",
        "muchos",
        "much\u00edsima",
        "much\u00edsimas",
        "much\u00edsimo",
        "much\u00edsimos",
        "muy",
        "m\u00e1s",
        "m\u00eda",
        "m\u00edo",
        "nada",
        "ni",
        "ninguna",
        "ningunas",
        "ninguno",
        "ningunos",
        "no",
        "nos",
        "nosotras",
        "nosotros",
        "nuestra",
        "nuestras",
        "nuestro",
        "nuestros",
        "nunca",
        "os",
        "otra",
        "otras",
        "otro",
        "otros",
        "para",
        "parecer",
        "poca",
        "pocas",
        "poco",
        "pocos",
        "por",
        "porque",
        "que",
        "querer",
        "quien",
        "quienes",
        "quienesquiera",
        "quienquiera",
        "qui\u00e9n",
        "qu\u00e9",
        "ser",
        "si",
        "siempre",
        "sr",
        "sra",
        "sres",
        "sta",
        "suya",
        "suyas",
        "suyo",
        "suyos",
        "s\u00ed",
        "s\u00edn", // reconstructed from "sn"
        "tal",
        "tales",
        "tan",
        "tanta",
        "tantas",
        "tanto",
        "tantos",
        "te",
        "tener",
        "ti",
        "toda",
        "todas",
        "todo",
        "todos",
        "tomar",
        "tuya",
        "tuyo",
        "t\u00fa",
        "un",
        "una",
        "unas",
        "unos",
        "usted",
        "ustedes",
        "varias",
        "varios",
        "vosotras",
        "vosotros",
        "vuestra",
        "vuestras",
        "vuestro",
        "vuestros",
        "y",
        "yo",
        "\u00e9l"
    };

    /**
     * Contains the stopwords used with the StopFilter.
     * Assigned exactly once, by whichever constructor is used.
     */
    private final Hashtable stopTable;

    /**
     * Contains words that should be indexed but not stemmed.
     * Empty unless one of the setStemExclusionTable methods is called.
     */
    private Hashtable exclTable = new Hashtable();

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopWords the words to drop from the token stream
     */
    public SpanishAnalyzer(String[] stopWords) {
        stopTable = StopFilter.makeStopTable(stopWords);
    }

    /**
     * Builds an analyzer that uses the default {@link #SPANISH_STOP_WORDS}.
     */
    public SpanishAnalyzer() {
        stopTable = StopFilter.makeStopTable(SPANISH_STOP_WORDS);
    }

    /**
     * Builds an analyzer with the given stop words from file.
     *
     * @param stopWords file handed to WordlistLoader.getWordtable to obtain the stop words
     */
    public SpanishAnalyzer(File stopWords) {
        stopTable = WordlistLoader.getWordtable(stopWords);
    }

    /**
     * Builds an exclusionlist from an array of Strings.
     * See the note in {@link #tokenStream}: the exclusion table is stored but
     * not yet consulted when the token stream is built.
     *
     * @param exclusionList words that should be indexed but not stemmed
     */
    public void setStemExclusionTable(String[] exclusionList) {
        exclTable = StopFilter.makeStopTable(exclusionList);
    }

    /**
     * Builds an exclusionlist from a Hashtable. The table is stored by
     * reference, not copied.
     *
     * @param exclusionList table of words that should be indexed but not stemmed
     */
    public void setStemExclusionTable(Hashtable exclusionList) {
        exclTable = exclusionList;
    }

    /**
     * Builds an exclusionlist from the words contained in the given file.
     *
     * @param exclusionList file handed to WordlistLoader.getWordtable to obtain the words
     */
    public void setStemExclusionTable(File exclusionList) {
        exclTable = WordlistLoader.getWordtable(exclusionList);
    }

    /**
     * Constructs a {@link StandardTokenizer} filtered by a {@link
     * StandardFilter}, a {@link LowerCaseFilter}, a {@link StopFilter}
     * and a {@link SpanishStemFilter}.
     *
     * @param fieldName name of the field being analyzed; not used by this analyzer
     * @param reader source of the text to tokenize
     * @return the filtered token stream
     */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
        TokenStream result = new StandardTokenizer(reader);
        result = new StandardFilter(result);
        // Lower-casing happens before stop-word removal, so the stop table is
        // matched against lower-cased tokens.
        result = new LowerCaseFilter(result);
        result = new StopFilter(result, stopTable);
        // TODO: exclTable is not applied here, so stem exclusions currently have
        // no effect. The intended call is SpanishStemFilter(result, exclTable),
        // to be enabled once the filter accepts an exclusion table.
        result = new SpanishStemFilter(result);
        return result;
    }
}