Take a raw text for preprocessing:
text = 'India is a great place to live, is it really ?Democracy is sham.@modi,'
Once tokenised, the basic steps involved in preprocessing are:
lower casing
removing stop words and punctuation
stemming
There might also be a need to remove tokens like '#' symbols and '@' mentions; that can be done with Python's re library, as sketched below.
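A minimal sketch of that cleanup with re, assuming the text variable defined above; the patterns (dropping @mentions and the '#' character) and the clean_text name are illustrative choices, not the only way to do it:
import re

# drop @mentions such as @modi, keep the rest of the text
clean_text = re.sub(r'@\w+', '', text)
# drop the '#' character but keep the hashtag word itself
clean_text = re.sub(r'#', '', clean_text)
print(clean_text)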
from nltk.tokenize import TweetTokenizer  # class for tokenizing tweet-like strings

tokenizer = TweetTokenizer()               # instantiate the tokenizer
tweet_tokens = tokenizer.tokenize(text)    # tokenize the raw text defined above
print(tweet_tokens)
lower casing
tweet_tokens = [word.lower() for word in tweet_tokens]  # lower-case every token
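As an aside, TweetTokenizer can lower-case (and drop @handles) at tokenization time through its preserve_case and strip_handles flags, which makes the separate pass above optional; a small sketch:
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
tweet_tokens = tokenizer.tokenize(text)  # tokens come out already lower-cased, without @mentions
print(tweet_tokens)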
removing stop words and punctuation
from nltk.corpus import stopwords
import string

# Import the English stop words list from NLTK
stopwords_english = stopwords.words('english')

tweets_clean = []
for word in tweet_tokens:                     # go through every word in the tokens list
    if (word not in stopwords_english and     # remove stop words
            word not in string.punctuation):  # remove punctuation
        tweets_clean.append(word)

print('removed stop words and punctuation:')
print(tweets_clean)
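Note that stopwords.words('english') raises a LookupError if the corpus has never been downloaded; it can be fetched once with nltk.download:
import nltk
nltk.download('stopwords')  # one-time download of the stop words corpus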
stemming-
from nltk.stem import PorterStemmer

# Instantiate the stemming class
stemmer = PorterStemmer()

# Create an empty list to store the stems
tweets_stem = []
for word in tweets_clean:
    stem_word = stemmer.stem(word)  # stem the word
    tweets_stem.append(stem_word)   # append to the list

print('stemmed words:')
print(tweets_stem)
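Putting the steps together, here is a sketch of a single helper that runs the whole pipeline on any tweet; the name process_tweet and its exact structure are illustrative choices, not an NLTK API:
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def process_tweet(tweet):
    # tokenize, lower-case and drop @mentions in one pass
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    stopwords_english = stopwords.words('english')
    stemmer = PorterStemmer()
    # keep tokens that are neither stop words nor punctuation, then stem them
    return [stemmer.stem(word) for word in tokens
            if word not in stopwords_english and word not in string.punctuation]

print(process_tweet(text))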