Abusive Language Classification - Code
Dataset: Hate Speech and Offensive Language Dataset
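Per the dataset description, each tweet was rated by several crowd annotators: count is the number of annotators, hate_speech / offensive_language / neither are the vote counts for each label, and class is the majority label (0 = hate speech, 1 = offensive language, 2 = neither).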
In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import re
import demoji
from wordcloud import WordCloud, STOPWORDS
from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
In [ ]:
tweet_data=pd.read_csv("labeled_data.csv")
In [ ]:
print(tweet_data.shape)
In [ ]:
tweet_data.columns
In [ ]:
tweet_data.head()
In [ ]:
tweet_data.tail()
In [ ]:
tweet_data.sample(5)
In [ ]:
tweet_data=tweet_data.drop(['Unnamed: 0'],axis=1)
tweet_data
In [ ]:
tweet_data.isna().sum()
In [ ]:
tweet_data.duplicated().sum()
In [ ]:
tweet_data.describe()
In [ ]:
data_plot=sns.boxplot(data=tweet_data)
data_plot.set_xticklabels(data_plot.get_xticklabels(), rotation=45)
In [ ]:
for col in ['count', 'hate_speech', 'offensive_language', 'neither', 'class']:
    sns.histplot(tweet_data[col])
    plt.show()
In [ ]:
tweet_data[tweet_data['hate_speech']>0].describe()
In [ ]:
tweet_data[tweet_data['offensive_language']>0].describe()
In [ ]:
tweet_data[tweet_data['neither']>0].describe()
In [ ]:
tweet_data[tweet_data['class']==0].describe()
In [ ]:
tweet_data[tweet_data['class']==1].describe()
In [ ]:
tweet_data[tweet_data['class']==2].describe()
In [ ]:
sns.stripplot(data=tweet_data, x="hate_speech", y="class")
plt.show()
In [ ]:
sns.stripplot(data=tweet_data, x="offensive_language", y="class")
plt.show()
In [ ]:
sns.stripplot(data=tweet_data, x="neither", y="class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "hate_speech", y = "offensive_language", data = tweet_data, hue = "class")
plt.title("Scatter Plot for hate_speech vs offensive_language according to their class label")
plt.show()
In [ ]:
tweet_data.groupby(['class']).count()
In [ ]:
len(tweet_data[(tweet_data['hate_speech']>0)&(tweet_data['class']==0)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['hate_speech']>0)&(tweet_data['class']==1)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['hate_speech']>0)&(tweet_data['class']==2)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['offensive_language']>0)&(tweet_data['class']==0)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['offensive_language']>0)&(tweet_data['class']==1)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['offensive_language']>0)&(tweet_data['class']==2)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['neither']>0)&(tweet_data['class']==0)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['neither']>0)&(tweet_data['class']==1)])/len(tweet_data)*100
In [ ]:
len(tweet_data[(tweet_data['neither']>0)&(tweet_data['class']==2)])/len(tweet_data)*100
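The nine cells above repeat one computation over every (column, class) pair; the same percentages can be produced in a single loop:
In [ ]:
# Same numbers as the nine cells above, in one loop.
for col in ['hate_speech', 'offensive_language', 'neither']:
    for c in [0, 1, 2]:
        pct = len(tweet_data[(tweet_data[col] > 0) & (tweet_data['class'] == c)]) / len(tweet_data) * 100
        print(f"{col} > 0, class == {c}: {pct:.2f}%")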
In [ ]:
tweet_data['class'].hist()
In [ ]:
tweet_data["tweet"] = tweet_data["tweet"].astype(str)
In [ ]:
print(string.punctuation)
def remove_punctuation(tweet):
    punctuationfree = "".join([i for i in tweet if i not in string.punctuation])
    return punctuationfree
tweet_data['preprocess_tweet'] = tweet_data['tweet'].apply(remove_punctuation)
tweet_data.head()
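A quick sanity check: string.punctuation covers ASCII punctuation only, so things like curly quotes and emojis pass through untouched.
In [ ]:
remove_punctuation("Don't stop!! @user #tag")  # -> 'Dont stop user tag'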
In [ ]:
tweet_data['preprocess_tweet']= tweet_data['preprocess_tweet'].apply(lambda x: x.lower())
tweet_data.head()
In [ ]:
def tokenization(tweet):
    # split on runs of non-word characters
    tokens = re.split(r'\W+', tweet)
    return tokens
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(tokenization)
tweet_data.head()
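A sanity check on the tokenizer: r'\W+' splits on runs of non-word characters, and can leave an empty string at either end if the text starts or ends with one.
In [ ]:
re.split(r'\W+', 'rt user this is a tweet ')  # note the empty trailing token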
In [ ]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[0:10])
def remove_stopwords(tweet):
    output = [i for i in tweet if i not in stopwords]
    return output
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(remove_stopwords)
tweet_data.head()
In [ ]:
porter_stemmer = PorterStemmer()
def stemming(tweet):
    stem_tweet = [porter_stemmer.stem(word) for word in tweet]
    return stem_tweet
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(stemming)
tweet_data.head()
In [ ]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(tweet):
    lemm_tweet = [wordnet_lemmatizer.lemmatize(word) for word in tweet]
    return lemm_tweet
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(lemmatizer)
tweet_data.head()
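The two steps behave quite differently: the Porter stemmer chops suffixes heuristically, while the WordNet lemmatizer maps words to dictionary forms (and defaults to noun part-of-speech, so many verb forms come back unchanged).
In [ ]:
# stem vs. lemma on a few examples
for word in ['running', 'studies', 'wolves']:
    print(word, '->', porter_stemmer.stem(word), '/', wordnet_lemmatizer.lemmatize(word))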
In [ ]:
def get_sentence(words):
    sentence = ' '.join(words)
    return sentence
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(get_sentence)
tweet_data.head()
In [ ]:
def remove_emoji(tweet):
    # demoji.findall returns a dict mapping each emoji found to its description
    dem = demoji.findall(tweet)
    for item in dem.keys():
        tweet = tweet.replace(item, '')
    return tweet
tweet_data['preprocess_tweet'] = tweet_data['preprocess_tweet'].apply(remove_emoji)
tweet_data.head()
In [ ]:
tweet=" ".join(i for i in tweet_data.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
tweet_data.head()
In [ ]:
tweet_data_hate = tweet_data[tweet_data["class"]==0]
tweet_data_offensive = tweet_data[tweet_data["class"]==1]
tweet_data_neither = tweet_data[tweet_data["class"]==2]
In [ ]:
print(tweet_data_hate.shape)
In [ ]:
tweet=" ".join(i for i in tweet_data_hate.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
tweet=" ".join(i for i in tweet_data_offensive.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
tweet=" ".join(i for i in tweet_data_neither.preprocess_tweet)
stopwords=set(STOPWORDS)
wordcloud = WordCloud(width = 1000, height = 500,
background_color ='white',
stopwords = stopwords, max_words=100,
min_font_size = 10).generate(tweet)
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
In [ ]:
def getPolarity(tweet):
    return TextBlob(tweet).sentiment.polarity
tweet_data['polarity'] = tweet_data['preprocess_tweet'].apply(getPolarity)
tweet_data.sample(5)
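TextBlob polarity lies in [-1, 1], from negative to positive sentiment; a quick check on toy strings (illustrative only, not dataset rows):
In [ ]:
for s in ['i love this', 'i hate this', 'the sky is blue']:
    print(s, '->', TextBlob(s).sentiment.polarity)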
In [ ]:
def getAnalysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
tweet_data['sentiment'] = tweet_data['polarity'].apply(getAnalysis)
tweet_data.sample(5)
In [ ]:
sns.set(rc={'figure.figsize':(5,5)})
tweet_data['polarity'].hist()
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "hate_speech", y = "polarity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "offensive_language", y = "polarity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "neither", y = "polarity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
tweet_data_negative = tweet_data[tweet_data["sentiment"]=='Negative']
tweet_data_positive = tweet_data[tweet_data["sentiment"]=='Positive']
tweet_data_neutral = tweet_data[tweet_data["sentiment"]=='Neutral']
In [ ]:
def count_values_in_column(data, feature):
    total = data.loc[:, feature].value_counts(dropna=False)
    percentage = round(data.loc[:, feature].value_counts(dropna=False, normalize=True) * 100, 2)
    return pd.concat([total, percentage], axis=1, keys=["Total", "Percentage"])
count_values_in_column(tweet_data, "sentiment")
In [ ]:
plt.figure(figsize=(13, 8), dpi=80)
pichart = count_values_in_column(tweet_data, "sentiment")
# value_counts orders by frequency, so take the labels from the index rather
# than hard-coding them, and map each sentiment to a fixed colour
names = pichart.index
size = pichart["Percentage"]
colors = {'Positive': 'green', 'Neutral': 'blue', 'Negative': 'red'}
# Create a white circle for the center of the plot (donut chart)
my_circle = plt.Circle((0, 0), 0.5, color='white')
plt.pie(size, labels=names, colors=[colors[n] for n in names])
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.show()
In [ ]:
sns.countplot(data=tweet_data, x="sentiment")
plt.show()
In [ ]:
data_neg = tweet_data_negative['preprocess_tweet']
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1000 , height = 500,
collocations=False).generate(" ".join(data_neg))
plt.imshow(wc)
plt.show()
In [ ]:
data_pos = tweet_data_positive['preprocess_tweet']
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 1000 , width = 1000 , height = 500,
collocations=False).generate(" ".join(data_pos))
plt.imshow(wc)
plt.show()
In [ ]:
def getSubjectivity(tweet):
    return TextBlob(tweet).sentiment.subjectivity
In [ ]:
tweet_data['subjectivity']=tweet_data['preprocess_tweet'].apply(getSubjectivity)
tweet_data.head()
In [ ]:
sns.set(rc={'figure.figsize':(5,5)})
tweet_data['subjectivity'].hist()
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "hate_speech", y = "subjectivity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "offensive_language", y = "subjectivity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
# scatter plot hue parameter
sns.scatterplot(x = "neither", y = "subjectivity", data = tweet_data, hue = "class")
plt.show()
In [ ]:
tweet_data.to_excel('preprocessed_labeled_data.xlsx', index=False)
In [ ]:
tweet_data = pd.read_excel('preprocessed_labeled_data.xlsx')
print(tweet_data.shape)
In [ ]:
tweet_data.info()
In [ ]:
# calculate the correlation matrix over the numeric columns only
# (newer pandas raises on non-numeric columns without numeric_only=True)
corr = tweet_data.corr(numeric_only=True)
# plot the heatmap
sns.heatmap(corr,
xticklabels=corr.columns,
yticklabels=corr.columns)
plt.show()
Machine Learning
In [ ]:
X=tweet_data['preprocess_tweet']
Y=tweet_data['class']
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [ ]:
vectoriser = CountVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
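For intuition, a minimal sketch (on toy sentences, not the tweet data) of what the vectoriser produces: each document becomes a sparse vector of token counts over the fitted vocabulary.
In [ ]:
demo = CountVectorizer().fit(['the cat sat', 'the cat ran'])
print(demo.get_feature_names_out())                       # vocabulary
print(demo.transform(['the cat sat the cat']).toarray())  # counts per token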
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
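The predict / score / report / heatmap block above is repeated verbatim for every model below; a small helper like this sketch would remove the duplication:
In [ ]:
# Optional helper (sketch): evaluate a fitted classifier and plot its
# confusion matrix, mirroring the cells above.
def evaluate(clf, X_test, y_test):
    predictions = clf.predict(X_test)
    score = clf.score(X_test, y_test)
    print('Accuracy:', metrics.accuracy_score(y_test, predictions))
    print(metrics.classification_report(y_test, predictions))
    cm = metrics.confusion_matrix(y_test, predictions)
    plt.figure(figsize=(9, 9))
    sns.heatmap(cm, annot=True, fmt='.3f', linewidths=.5, square=True, cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('Accuracy score: {0}'.format(score), size=15)
    plt.show()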
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
vectoriser = TfidfVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
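TF-IDF reweights the raw counts: with scikit-learn defaults (smooth_idf=True), idf(t) = ln((1 + n) / (1 + df(t))) + 1, and each document vector is l2-normalised. The learned weights are exposed via idf_; on a toy corpus:
In [ ]:
demo = TfidfVectorizer().fit(['the cat sat', 'the cat ran'])
print(dict(zip(demo.get_feature_names_out(), demo.idf_)))  # rarer tokens get higher idf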
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [ ]:
vectoriser = CountVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
vectoriser = TfidfVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
Fine-Tuning
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0)
In [ ]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
In [ ]:
vectoriser = CountVectorizer()
vectoriser.fit(X_train)
print(f'Vectoriser fitted.')
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))
In [ ]:
X_train = vectoriser.transform(X_train)
X_test = vectoriser.transform(X_test)
print(f'Data Transformed.')
In [ ]:
clf = MultinomialNB(alpha=1.0, fit_prior=False, class_prior=None)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
DecisionTreeClassifier
In [ ]:
clf = DecisionTreeClassifier(criterion='gini',splitter='best',max_features=None, random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.model_selection import GridSearchCV
max_depth
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=None, random_state=0),
param_grid={'max_depth': list(range(2, 100))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_estimator_
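best_estimator_ shows only the winner; the full search history lives in cv_results_, which converts nicely to a DataFrame:
In [ ]:
pd.DataFrame(gs.cv_results_)[['param_max_depth', 'mean_test_score']].head()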
min_samples_split
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, random_state=0),
param_grid={'min_samples_split': list(range(2, 10))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_estimator_
In [ ]:
gs.best_params_
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, random_state=0),
param_grid={'min_samples_split': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_estimator_
In [ ]:
gs.best_params_
min_samples_leaf
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65,min_samples_split=0.1, random_state=0),
param_grid={'min_samples_leaf': list(range(1, 10))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65,min_samples_split=0.1, random_state=0),
param_grid={'min_samples_leaf': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
max_features
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,random_state=0),
param_grid={'max_features': list(range(1, 10))},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,random_state=0),
param_grid={'max_features': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
max_leaf_nodes
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
                                       max_depth=65, min_samples_split=0.1,
                                       min_samples_leaf=4,max_features=None,random_state=0),
                param_grid={'max_leaf_nodes': list(range(2, 100))},  # must be > 1
                verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
gs.best_params_
min_impurity_decrease
In [ ]:
gs=GridSearchCV(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=63,random_state=0),
param_grid={'min_impurity_decrease': [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]},
verbose=3)
In [ ]:
gs.fit(X_train, y_train)
In [ ]:
clf.get_params()
In [ ]:
gs.best_params_
In [ ]:
clf = DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=None,min_impurity_decrease=0.0,
random_state=0)
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
Ensemble
In [ ]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(DecisionTreeClassifier(random_state=0))
clf.fit(X_train, y_train)
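BaggingClassifier trains n_estimators copies of the base tree on bootstrap samples of the training set and majority-votes their predictions (n_estimators defaults to 10); the fitted members are kept in estimators_:
In [ ]:
print(len(clf.estimators_))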
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import BaggingClassifier
clf = BaggingClassifier(DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=None,min_impurity_decrease=0.0,
random_state=0))
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import AdaBoostClassifier
# base_estimator was renamed to estimator in scikit-learn 1.2
clf = AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=0))
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(criterion='gini', splitter='best',
                                     max_depth=65, min_samples_split=0.1,
                                     min_samples_leaf=4, max_features=None,
                                     max_leaf_nodes=None, min_impurity_decrease=0.0,
                                     random_state=0))
clf.fit(X_train, y_train)
In [ ]:
predictions=clf.predict(X_test)
score=clf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()
In [ ]:
from sklearn.ensemble import VotingClassifier
In [ ]:
clf1 = MultinomialNB(alpha=1.0, fit_prior=False, class_prior=None)
clf2 = DecisionTreeClassifier(random_state=0)
clf3 = DecisionTreeClassifier(criterion='gini',splitter='best',
max_depth=65, min_samples_split=0.1,
min_samples_leaf=4,max_features=None,
max_leaf_nodes=None,min_impurity_decrease=0.0,
random_state=0)
eclf = VotingClassifier(estimators=[('mnb', clf1), ('dt', clf2), ('ft-dt', clf3)],
                        voting='soft', weights=[1, 1, 2])
eclf.fit(X_train, y_train)
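With voting='soft', the ensemble averages the three classifiers' predicted class probabilities using the given weights and predicts the argmax. An illustrative check on a few test rows (the fitted members live in named_estimators_):
In [ ]:
fitted = eclf.named_estimators_
probs = (1 * fitted['mnb'].predict_proba(X_test[:3])
         + 1 * fitted['dt'].predict_proba(X_test[:3])
         + 2 * fitted['ft-dt'].predict_proba(X_test[:3])) / 4
print(probs.argmax(axis=1))        # weighted-average argmax ...
print(eclf.predict(X_test[:3]))    # ... should match the ensemble's prediction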
In [ ]:
predictions=eclf.predict(X_test)
score=eclf.score(X_test,y_test)
print(score*100)
In [ ]:
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print(metrics.classification_report(y_test,predictions))
In [ ]:
cm=metrics.confusion_matrix(y_test,predictions)
print(cm)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap="Blues_r");
plt.ylabel("Actual label");
plt.xlabel("Predicted label")
all_sample_title="Accuracy score: {0}".format(score)
plt.title(all_sample_title,size=15)
plt.show()