Abusive Language Classification - Code
Dataset: Hate Speech and Offensive Language Dataset
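Per the dataset description, each tweet was rated by several crowd annotators: count is the number of annotators, hate_speech / offensive_language / neither are the vote counts for each label, and class is the majority label (0 = hate speech, 1 = offensive language, 2 = neither).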
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import re
import demoji
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
In [ ]:
tweet_data=pd.read_csv("labeled_data.csv")
In [ ]:
print(tweet_data.shape)
In [ ]:
tweet_data.columns
In [ ]:
tweet_data.head()
In [ ]:
tweet_data.tail()
In [ ]:
tweet_data.sample(5)
In [ ]:
tweet_data=tweet_data.drop(['Unnamed: 0'],axis=1)
tweet_data
In [ ]:
tweet_data.isna().sum()
In [ ]:
tweet_data.duplicated().sum()
In [ ]:
tweet_data.describe()
In [ ]:
data_plot=sns.boxplot(data=tweet_data)
data_plot.set_xticklabels(data_plot.get_xticklabels(), rotation=45)
In [ ]:
for col in ['count', 'hate_speech', 'offensive_language', 'neither', 'class']:
    sns.histplot(tweet_data[col])
    plt.show()
In [ ]:
tweet_data[tweet_data['hate_speech']>0].describe()
In [ ]:
tweet_data[tweet_data['offensive_language']>0].describe()
In [ ]:
tweet_data[tweet_data['neither']>0].describe()
In [ ]:
tweet_data[tweet_data['class']==0].describe()
In [ ]:
tweet_data[tweet_data['class']==1].describe()
In [ ]:
tweet_data[tweet_data['class']==2].describe()
In [ ]:
sns.stripplot(data=tweet_data, x="hate_speech", y="class")
plt.show()
In [ ]:
sns.stripplot(data=tweet_data, x="offensive_language", y="class")
plt.show()
In [ ]:
sns.stripplot(data=tweet_data, x="neither", y="class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "hate_speech", y = "offensive_language", data = tweet_data, hue = "class")
plt.title("Scatter Plot for hate_speech vs offensive_language according to their class label")
plt.show()
In [ ]:
tweet_data.groupby(['class']).count()
In [ ]:
len(tweet_data[(tweet_data['hate_speech']>0)&(tweet_data['class']==0)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['hate_speech']>0)&(tweet_data['class']==1)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['hate_speech']>0)&(tweet_data['class']==2)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['offensive_language']>0)&(tweet_data['class']==0)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['offensive_language']>0)&(tweet_data['class']==1)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['offensive_language']>0)&(tweet_data['class']==2)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['neither']>0)&(tweet_data['class']==0)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['neither']>0)&(tweet_data['class']==1)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['neither']>0)&(tweet_data['class']==2)])/len(tweet_data)*100
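The nine cells above repeat one computation over every (column, class) pair; the same percentages can be produced in a single loop:
In [ ]:
# Same numbers as the nine cells above, in one loop.
for col in ['hate_speech', 'offensive_language', 'neither']:
    for c in [0, 1, 2]:
        pct = len(tweet_data[(tweet_data[col] > 0) & (tweet_data['class'] == c)]) / len(tweet_data) * 100
        print(f"{col} > 0, class == {c}: {pct:.2f}%")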
In [ ]:
tweet_data['class'].hist()
In [ ]:
tweet_data["tweet"] = tweet_data["tweet"].astype(str)
In [ ]:
print(string.punctuation)
def remove_punctuation(tweet):
    punctuationfree = "".join([i for i in tweet if i not in string.punctuation])
    return punctuationfree
tweet_data['preprocess_tweet'] = tweet_data['tweet'].apply(remove_punctuation)
tweet_data.head()
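A quick sanity check: string.punctuation covers ASCII punctuation only, so things like curly quotes and emojis pass through untouched.
In [ ]:
remove_punctuation("Don't stop!! @user #tag")  # -> 'Dont stop user tag'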
In [ ]:
tweet_data['preprocess_tweet']= tweet_data['preprocess_tweet'].apply(lambda x: x.lower())
tweet_data.head()
In [ ]:
def tokenization(tweet):
    # split on runs of non-word characters
    tokens = re.split(r'\W+', tweet)
    return tokens
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(tokenization)
tweet_data.head()
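A sanity check on the tokenizer: r'\W+' splits on runs of non-word characters, and can leave an empty string at either end if the text starts or ends with one.
In [ ]:
re.split(r'\W+', 'rt user this is a tweet ')  # note the empty trailing token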
In [ ]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[0:10])
def remove_stopwords(tweet):
    output = [i for i in tweet if i not in stopwords]
    return output
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(remove_stopwords)
tweet_data.head()
In [ ]:
porter_stemmer = PorterStemmer()
def stemming(tweet):
    stem_tweet = [porter_stemmer.stem(word) for word in tweet]
    return stem_tweet
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(stemming)
tweet_data.head()
In [ ]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(tweet):
    lemm_tweet = [wordnet_lemmatizer.lemmatize(word) for word in tweet]
    return lemm_tweet
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(lemmatizer)
tweet_data.head()
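The two steps behave quite differently: the Porter stemmer chops suffixes heuristically, while the WordNet lemmatizer maps words to dictionary forms (and defaults to noun part-of-speech, so many verb forms come back unchanged).
In [ ]:
# stem vs. lemma on a few examples
for word in ['running', 'studies', 'wolves']:
    print(word, '->', porter_stemmer.stem(word), '/', wordnet_lemmatizer.lemmatize(word))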
In [ ]:
def get_sentence(words):
    sentence = ' '.join(words)
    return sentence
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(get_sentence)
tweet_data.head()
In [ ]:
def remove_emoji(tweet):
    # demoji.findall returns a dict mapping each emoji found to its description
    dem = demoji.findall(tweet)
    for item in dem.keys():
        tweet = tweet.replace(item, '')
    return tweet
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(remove_emoji)
tweet_data.head()
In [ ]:
tweet=" ".join(i for i in tweet_data.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
tweet_data.head()
In [ ]:
tweet_data_hate = tweet_data[tweet_data["class"]==0]
tweet_data_offensive = tweet_data[tweet_data["class"]==1]
tweet_data_neither = tweet_data[tweet_data["class"]==2]
In [ ]:
print(tweet_data_hate.shape)
In [ ]:
tweet=" ".join(i for i in tweet_data_hate.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
tweet=" ".join(i for i in tweet_data_offensive.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
tweet=" ".join(i for i in tweet_data_neither.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity
tweet_data['polarity'] = tweet_data['preprocess_tweet'].apply(getPolarity)
tweet_data.sample(5)
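TextBlob polarity lies in [-1, 1], from negative to positive sentiment; a quick check on toy strings (illustrative only, not dataset rows):
In [ ]:
for s in ['i love this', 'i hate this', 'the sky is blue']:
    print(s, '->', TextBlob(s).sentiment.polarity)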
In [ ]:
def getAnalysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
tweet_data['sentiment'] = tweet_data['polarity'].apply(getAnalysis)
tweet_data.sample(5)
In [ ]:
sns.set(rc={'figure.figsize':(5,5)})
tweet_data['polarity'].hist()
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "hate_speech", y = "polarity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "offensive_language", y = "polarity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "neither", y = "polarity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
tweet_data_negative = tweet_data[tweet_data["sentiment"]=='Negative']
tweet_data_positive = tweet_data[tweet_data["sentiment"]=='Positive']
tweet_data_neutral = tweet_data[tweet_data["sentiment"]=='Neutral']
In [ ]:
def count_values_in_column(data, feature):
    total = data.loc[:, feature].value_counts(dropna=False)
    percentage = round(data.loc[:, feature].value_counts(dropna=False, normalize=True) * 100, 2)
    return pd.concat([total, percentage], axis=1, keys=["Total", "Percentage"])
count_values_in_column(tweet_data, "sentiment")
In [ ]:
plt.figure(figsize=(13, 8), dpi=80)
pichart = count_values_in_column(tweet_data, "sentiment")
# value_counts orders by frequency, so take the labels from the index rather
# than hard-coding them, and map each sentiment to a fixed colour
names = pichart.index
size = pichart["Percentage"]
colors = {'Positive': 'green', 'Neutral': 'blue', 'Negative': 'red'}
# Create a white circle for the center of the plot (donut chart)
my_circle = plt.Circle((0, 0), 0.5, color='white')
plt.pie(size, labels=names, colors=[colors[n] for n in names])
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.show()
In [ ]:
sns.countplot(data=tweet_data, x="sentiment")
plt.show()
In [ ]:
data_neg = tweet_data_negative['preprocess_tweet']
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1000 , height = 500,
collocations=False).generate(" ".join(data_neg))
plt.imshow(wc)
plt.show()
In [ ]:
data_pos = tweet_data_positive['preprocess_tweet']
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1000 , height = 500,
collocations=False).generate(" ".join(data_pos))
plt.imshow(wc)
plt.show()
In [ ]:
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity
In [ ]:
tweet_data['subjectivity']=tweet_data['preprocess_tweet'].apply(getSubjectivity)
tweet_data.head()
In [ ]:
sns.set(rc={'figure.figsize':(5,5)})
tweet_data['subjectivity'].hist()
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "hate_speech", y = "subjectivity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "offensive_language", y = "subjectivity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "neither", y = "subjectivity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
tweet_data.to_excel('preprocessed_labeled_data.xlsx', index=False)
In [ ]:
tweet_data = pd.read_excel('preprocessed_labeled_data.xlsx')
print(tweet_data.shape)
In [ ]:
tweet_data.info()
In [ ]:
# calculate the correlation matrix over the numeric columns only
# (newer pandas raises on non-numeric columns without numeric_only=True)
corr = tweet_data.corr(numeric_only=True)
# plot the heatmap
sns.heatmap(corr,
xticklabels=corr.columns,
yticklabels=corr.columns)
plt.show()
Machine Learning
In [ ]:
X=tweet_data['preprocess_tweet']
Y=tweet_data['class']
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [ ]:
vectoriser = CountVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
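For intuition, a minimal sketch (on toy sentences, not the tweet data) of what the vectoriser produces: each document becomes a sparse vector of token counts over the fitted vocabulary.
In [ ]:
demo = CountVectorizer().fit(['the cat sat', 'the cat ran'])
print(demo.get_feature_names_out())                       # vocabulary
print(demo.transform(['the cat sat the cat']).toarray())  # counts per token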
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
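The predict / score / report / heatmap block above is repeated verbatim for every model below; a small helper like this sketch would remove the duplication:
In [ ]:
# Optional helper (sketch): evaluate a fitted classifier and plot its
# confusion matrix, mirroring the cells above.
def evaluate(clf, X_test, y_test):
    predictions = clf.predict(X_test)
    score = clf.score(X_test, y_test)
    print('Accuracy:', metrics.accuracy_score(y_test, predictions))
    print(metrics.classification_report(y_test, predictions))
    cm = metrics.confusion_matrix(y_test, predictions)
    plt.figure(figsize=(9, 9))
    sns.heatmap(cm, annot=True, fmt='.3f', linewidths=.5, square=True, cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Accuracy score: {0}'.format(score), size=15)
    plt.show()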
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
vectoriser = TfidfVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
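TF-IDF reweights the raw counts: with scikit-learn defaults (smooth_idf=True), idf(t) = ln((1 + n) / (1 + df(t))) + 1, and each document vector is l2-normalised. The learned weights are exposed via idf_; on a toy corpus:
In [ ]:
demo = TfidfVectorizer().fit(['the cat sat', 'the cat ran'])
print(dict(zip(demo.get_feature_names_out(), demo.idf_)))  # rarer tokens get higher idf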
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [ ]:
vectoriser = CountVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
vectoriser = TfidfVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
Fine-Tuning
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [ ]:
vectoriser = CountVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = MultinomialNB(alpha=1.0, fit_prior=False, class_prior=None)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
DecisionTreeClassifier
In [ ]:
clf = DecisionTreeClassifier(criterion='gini',splitter='best',max_features=None, random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.model_selection import GridSearchCV
max_depth
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=None, random_state=0),
param_grid={'max_depth': list(range(2, 100))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_estimator_
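best_estimator_ shows only the winner; the full search history lives in cv_results_, which converts nicely to a DataFrame:
In [ ]:
pd.DataFrame(gs.cv_results_)[['param_max_depth', 'mean_test_score']].head()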
min_samples_split
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, random_state=0),
param_grid={'min_samples_split': list(range(2, 10))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_estimator_
In [ ]:
gs.best_params_
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, random_state=0),
param_grid={'min_samples_split': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_estimator_
In [ ]:
gs.best_params_
min_samples_leaf
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65,min_samples_split=0.1, random_state=0),
param_grid={'min_samples_leaf': list(range(1, 10))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65,min_samples_split=0.1, random_state=0),
param_grid={'min_samples_leaf': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
max_features
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,random_state=0),
param_grid={'max_features': list(range(1, 10))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,random_state=0),
param_grid={'max_features': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
max_leaf_nodes
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
                                       max_depth=65, min_samples_split=0.1,
                                       min_samples_leaf=4,max_features=None,random_state=0),
                param_grid={'max_leaf_nodes': list(range(2, 100))},  # must be > 1
                verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
min_impurity_decrease
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=63,random_state=0),
param_grid={'min_impurity_decrease': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
clf.get_params()
In [ ]:
gs.best_params_
In [ ]:
clf = DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=None,min_impurity_decrease=0.0,
random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
Ensemble
In [ ]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(DecisionTreeClassifier(random_state=0))
clf.fit(X_train, y_train)
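BaggingClassifier trains n_estimators copies of the base tree on bootstrap samples of the training set and majority-votes their predictions (n_estimators defaults to 10); the fitted members are kept in estimators_:
In [ ]:
print(len(clf.estimators_))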
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=None,min_impurity_decrease=0.0,
random_state=0))
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import AdaBoostClassifier
# base_estimator was renamed to estimator in scikit-learn 1.2
clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=0))
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(criterion='gini', splitter='best',
                                     max_depth=65, min_samples_split=0.1,
                                     min_samples_leaf=4, max_features=None,
                                     max_leaf_nodes=None, min_impurity_decrease=0.0,
                                     random_state=0))
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import VotingClassifier
In [ ]:
clf1 = MultinomialNB(alpha=1.0, fit_prior=False, class_prior=None)
clf2 = DecisionTreeClassifier(random_state=0)
clf3 = DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=None,min_impurity_decrease=0.0,
random_state=0)
eclf = VotingClassifier(estimators=[('mnb', clf1), ('dt', clf2), ('ft-dt', clf3)],
                        voting='soft', weights=[1, 1, 2])
eclf.fit(X_train, y_train)
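With voting='soft', the ensemble averages the three classifiers' predicted class probabilities using the given weights and predicts the argmax. An illustrative check on a few test rows (the fitted members live in named_estimators_):
In [ ]:
fitted = eclf.named_estimators_
probs = (1 * fitted['mnb'].predict_proba(X_test[:3])
         + 1 * fitted['dt'].predict_proba(X_test[:3])
         + 2 * fitted['ft-dt'].predict_proba(X_test[:3])) / 4
print(probs.argmax(axis=1))        # weighted-average argmax ...
print(eclf.predict(X_test[:3]))    # ... should match the ensemble's prediction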
In [ ]:
predictions=eclf.predict(X_test)
score=eclf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()