Stopwords are not useful to us, so we want code to remove them from strings. Usually, there is very little point in searching for the word "the" or "because" or "how" in an input string. We have medium performance requirements, meaning our algorithm must scale, and most importantly must be maintainable and simple.
| When the method receives this | We want to get back this |
| I saw a cat and a horse. | saw cat horse |
| Google owns the Internet. | Google owns Internet |
| Using an extra step to eliminate stopwords | Using extra step eliminate stopwords |
If a word occurs in 95% of documents indexed, it has no value in narrowing the results. Here are some English (en-us) stopwords. What you see right below is the Dictionary, which is populated using C# collection initializer syntax (a C# 3.0 language feature that was introduced alongside LINQ). As Microsoft FxCop would tell you, it is best to initialize static members inline, where they are declared, rather than afterwards (for example, in a static constructor).
/// <summary>
/// Holds a fixed table of common English (en-us) stopwords for filtering text
/// before it is searched or indexed.
/// </summary>
public static class Stopwords
{
    /// <summary>
    /// Stopword lookup table. Every key is a lower-case word, so callers must
    /// lower-case a candidate word before looking it up. Only key membership
    /// matters; every value is <c>true</c>.
    /// </summary>
    /// <remarks>
    /// The field is initialized inline and marked <c>readonly</c> so the
    /// reference cannot be reassigned after type initialization.
    /// </remarks>
    private static readonly Dictionary<string, bool> _stopwordsDict =
        new Dictionary<string, bool>
        {
            { "a", true },
            { "about", true },
            { "above", true },
            { "across", true },
            { "after", true },
            { "afterwards", true },
            { "again", true },
            { "against", true },
            { "all", true },
            { "almost", true },
            { "alone", true },
            { "along", true },
            { "already", true },
            { "also", true },
            { "although", true },
            { "always", true },
            { "am", true },
            { "among", true },
            { "amongst", true },
            // Misspelling of "amongst" carried by the original list; kept so
            // input filtered before this change is still filtered.
            { "amoungst", true },
            { "amount", true },
            { "an", true },
            { "and", true },
            { "another", true },
            { "any", true },
            { "anyhow", true },
            { "anyone", true },
            { "anything", true },
            { "anyway", true },
            { "anywhere", true },
            { "are", true },
            { "around", true },
            { "as", true },
            { "at", true },
            { "back", true },
            { "be", true },
            { "became", true },
            { "because", true },
            { "become", true },
            { "becomes", true },
            { "becoming", true },
            { "been", true },
            { "before", true },
            { "beforehand", true },
            { "behind", true },
            { "being", true },
            { "below", true },
            { "beside", true },
            { "besides", true },
            { "between", true },
            { "beyond", true },
            { "bill", true },
            { "both", true },
            { "bottom", true },
            { "but", true },
            { "by", true },
            { "call", true },
            { "can", true },
            { "cannot", true },
            { "cant", true },
            { "co", true },
            { "computer", true },
            { "con", true },
            { "could", true },
            { "couldnt", true },
            { "cry", true },
            { "de", true },
            { "describe", true },
            { "detail", true },
            { "do", true },
            { "done", true },
            { "down", true },
            { "due", true },
            { "during", true },
            { "each", true },
            { "eg", true },
            { "eight", true },
            { "either", true },
            { "eleven", true },
            { "else", true },
            { "elsewhere", true },
            { "empty", true },
            { "enough", true },
            { "etc", true },
            { "even", true },
            { "ever", true },
            { "every", true },
            { "everyone", true },
            { "everything", true },
            { "everywhere", true },
            { "except", true },
            { "few", true },
            { "fifteen", true },
            // "fify" is a misspelling of "fifty" carried by the original list;
            // it is kept for backward compatibility and the correct spelling
            // is added right after it.
            { "fify", true },
            { "fifty", true },
            { "fill", true },
            { "find", true },
            { "fire", true },
            { "first", true },
            { "five", true },
            { "for", true },
            { "former", true },
            { "formerly", true },
            { "forty", true },
            { "found", true },
            { "four", true },
            { "from", true },
            { "front", true },
            { "full", true },
            { "further", true },
            { "get", true },
            { "give", true },
            { "go", true },
            { "had", true },
            { "has", true },
            { "hasnt", true },
            { "have", true },
            { "he", true },
            { "hence", true },
            { "her", true },
            { "here", true },
            { "hereafter", true },
            { "hereby", true },
            { "herein", true },
            { "hereupon", true },
            { "hers", true },
            { "herself", true },
            { "him", true },
            { "himself", true },
            { "his", true },
            { "how", true },
            { "however", true },
            { "hundred", true },
            { "i", true },
            { "ie", true },
            { "if", true },
            { "in", true },
            { "inc", true },
            { "indeed", true },
            { "interest", true },
            { "into", true },
            { "is", true },
            { "it", true },
            { "its", true },
            { "itself", true },
            { "keep", true },
            { "last", true },
            { "latter", true },
            { "latterly", true },
            { "least", true },
            { "less", true },
            { "ltd", true },
            { "made", true },
            { "many", true },
            { "may", true },
            { "me", true },
            { "meanwhile", true },
            { "might", true },
            { "mill", true },
            { "mine", true },
            { "more", true },
            { "moreover", true },
            { "most", true },
            { "mostly", true },
            { "move", true },
            { "much", true },
            { "must", true },
            { "my", true },
            { "myself", true },
            { "name", true },
            { "namely", true },
            { "neither", true },
            { "never", true },
            { "nevertheless", true },
            { "next", true },
            { "nine", true },
            { "no", true },
            { "nobody", true },
            { "none", true },
            { "noone", true },
            { "nor", true },
            { "not", true },
            { "nothing", true },
            { "now", true },
            { "nowhere", true },
            { "of", true },
            { "off", true },
            { "often", true },
            { "on", true },
            { "once", true },
            { "one", true },
            { "only", true },
            { "onto", true },
            { "or", true },
            { "other", true },
            { "others", true },
            { "otherwise", true },
            { "our", true },
            { "ours", true },
            { "ourselves", true },
            { "out", true },
            { "over", true },
            { "own", true },
            { "part", true },
            { "per", true },
            { "perhaps", true },
            { "please", true },
            { "put", true },
            { "rather", true },
            { "re", true },
            { "same", true },
            { "see", true },
            { "seem", true },
            { "seemed", true },
            { "seeming", true },
            { "seems", true },
            { "serious", true },
            { "several", true },
            { "she", true },
            { "should", true },
            { "show", true },
            { "side", true },
            { "since", true },
            { "sincere", true },
            { "six", true },
            { "sixty", true },
            { "so", true },
            { "some", true },
            { "somehow", true },
            { "someone", true },
            { "something", true },
            { "sometime", true },
            { "sometimes", true },
            { "somewhere", true },
            { "still", true },
            { "such", true },
            { "system", true },
            { "take", true },
            { "ten", true },
            { "than", true },
            { "that", true },
            { "the", true },
            { "their", true },
            { "them", true },
            { "themselves", true },
            { "then", true },
            { "thence", true },
            { "there", true },
            { "thereafter", true },
            { "thereby", true },
            { "therefore", true },
            { "therein", true },
            { "thereupon", true },
            { "these", true },
            { "they", true },
            { "thick", true },
            { "thin", true },
            { "third", true },
            { "this", true },
            { "those", true },
            { "though", true },
            { "three", true },
            { "through", true },
            { "throughout", true },
            { "thru", true },
            { "thus", true },
            { "to", true },
            { "together", true },
            { "too", true },
            { "top", true },
            { "toward", true },
            { "towards", true },
            { "twelve", true },
            { "twenty", true },
            { "two", true },
            { "un", true },
            { "under", true },
            { "until", true },
            { "up", true },
            { "upon", true },
            { "us", true },
            { "very", true },
            { "via", true },
            { "was", true },
            { "we", true },
            { "well", true },
            { "were", true },
            { "what", true },
            { "whatever", true },
            { "when", true },
            { "whence", true },
            { "whenever", true },
            { "where", true },
            { "whereafter", true },
            { "whereas", true },
            { "whereby", true },
            { "wherein", true },
            { "whereupon", true },
            { "wherever", true },
            { "whether", true },
            { "which", true },
            { "while", true },
            { "whither", true },
            { "who", true },
            { "whoever", true },
            { "whole", true },
            { "whom", true },
            { "whose", true },
            { "why", true },
            { "will", true },
            { "with", true },
            { "within", true },
            { "without", true },
            { "would", true },
            { "yet", true },
            { "you", true },
            { "your", true },
            { "yours", true },
            { "yourself", true },
            { "yourselves", true }
        };

    // [Rest of class continues here. RemoveStopwords method should be put here.]
}
The important part is that we declare a static Dictionary to store the actual stopwords and add the items to it using collection initializer syntax. For checking whether a word is a stopword, a Dictionary lookup is far more efficient than searching through an array or List. For some syntax forms, you may need to include System.Linq.
Next, we have the method we call to remove stopwords from a string and then return the string that does not have those stopwords. For added benefit, I remove duplicate words from the input string. Don't use this if you need to preserve a readable string.
/// <summary>
/// Returns the words of <paramref name="inputValue"/> that are not stopwords,
/// in their original order and casing, joined by single spaces.
/// </summary>
/// <remarks>
/// Words are split on space, comma, semicolon and period. A word that has
/// already been emitted (compared case-insensitively) is dropped, so the
/// result is not a readable sentence.
/// </remarks>
/// <param name="inputValue">Text to filter. May be null or empty.</param>
/// <returns>
/// The filtered words separated by single spaces, or an empty string when
/// <paramref name="inputValue"/> is null, empty, or contains only stopwords.
/// </returns>
public static string RemoveStopwords(string inputValue)
{
    // Nothing to filter; also avoids a NullReferenceException on Split below.
    if (string.IsNullOrEmpty(inputValue))
    {
        return string.Empty;
    }

    // Lower-cased forms of the words already written to the result.
    var wordsFound = new HashSet<string>();
    StringBuilder builder = new StringBuilder();

    // Handle various punctuation characters in our input string.
    string[] inputWords = inputValue.Split(
        new char[] { ' ', ',', ';', '.' },
        StringSplitOptions.RemoveEmptyEntries);

    foreach (string currentWord in inputWords)
    {
        // Invariant lower-casing keeps lookups independent of the current
        // culture (e.g. under tr-TR, "I".ToLower() is not "i").
        string lowerWord = currentWord.ToLowerInvariant();

        // Keep the word only if it is not a stopword and has not occurred
        // before. HashSet.Add returns false for an already-present word, so
        // it doubles as the duplicate check.
        if (!_stopwordsDict.ContainsKey(lowerWord) && wordsFound.Add(lowerWord))
        {
            builder.Append(currentWord).Append(' ');
        }
    }

    // Each kept word is followed by a space; Trim removes the trailing one.
    return builder.ToString().Trim();
}
View and read this code at my source code site, where you can copy it. Using an extra step to eliminate stopwords or duplicate words from your input string can substantially optimize your database or query if it uses a full-text index.
You could also read the stopwords in from disk, using a StreamReader inside a using statement. The static constructor runs automatically, and in my testing it always runs right before it is actually needed, so startup penalties should be very minor.
A similar method I developed removes duplicate words from strings and is useful in situations where you want to remove redundancy but not specific words such as stopwords. Please see the article about removing duplicate words in C#.