The NLTK package provides a method show_most_informative_features() to find the most important features for both classes; for example, the movie-reviews classifier from the NLTK book prints output like:

Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
        contains(seagal) = True              neg : pos    =      7.7 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.8 : 1.0
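For reference, here is a minimal sketch of how that NLTK output is produced, following the movie_reviews setup from the NLTK book (shown only as a reminder, not part of the task below):

import nltk
from nltk.corpus import movie_reviews

# Label each review document with its category ('pos' or 'neg').
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Use the 2000 most frequent words as binary 'contains(word)' features.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [w for w, _ in all_words.most_common(2000)]

def document_features(document):
    words = set(document)
    return dict(('contains(%s)' % w, w in words) for w in word_features)

featuresets = [(document_features(d), c) for (d, c) in documents]
classifier = nltk.NaiveBayesClassifier.train(featuresets)
classifier.show_most_informative_features(10)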
With scikit-learn you can get a similar view, with one class on the left and the other on the right. For context, here is the classification report of the binary (Irrelevant vs. Relevant) classifier used in this example:

             precision    recall  f1-score   support

 Irrelevant       0.77      0.98      0.86       129
   Relevant       0.78      0.15      0.25        46

avg / total       0.77      0.77      0.70       175

and its most informative features, one column per class (most negative coefficients on the left, most positive on the right):
-1.3914 davis 1.4809 austin
-1.1023 suicide 1.0695 march
-1.0609 arrested 1.0379 call
-1.0145 miller 1.0152 tsa
-0.8902 packers 0.9848 passengers
-0.8370 train 0.9547 pensacola
-0.7557 trevor 0.7432 bag
-0.7457 near 0.7056 conditt
-0.7359 military 0.7002 midamerica
-0.7302 berlin 0.6987 mark
-0.6880 april 0.6799 grenade
-0.6581 plane 0.6357 suspicious
-0.6351 disposal 0.6348 death
-0.5804 wwii 0.6053 flight
-0.5723 terminal 0.5745 marabi
That two-column output was produced with the helper below (it assumes a linear model whose coef_ has a single row, as in binary classification):

def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
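To see it in action, here is a self-contained usage sketch; the toy documents and labels below are invented for illustration, and any vectorizer plus a linear classifier with a single-row coef_ (e.g. LogisticRegression) will do:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Invented toy corpus, loosely echoing the Relevant/Irrelevant example above.
docs = ['the flight was fine', 'suspicious bag at the terminal',
        'the game was fine', 'call the bomb disposal unit']
labels = ['Irrelevant', 'Relevant', 'Irrelevant', 'Relevant']

vec = CountVectorizer(analyzer='word')
X = vec.fit_transform(docs)
clf = LogisticRegression().fit(X, labels)

# Most negative coefficients lean 'Irrelevant', most positive lean 'Relevant'.
show_most_informative_features(vec, clf, n=5)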
In the case of binary classification, it seems the coefficient array has been flattened.
Let's try to relabel our data with only two labels:
import codecs, re, time
from itertools import chain
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
trainfile = 'train.txt'
# Vectorizing data.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
trainset = word_vectorizer.fit_transform(codecs.open(trainfile,'r','utf8'))
tags = ['bs','pt','bs','pt']
# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)
print mnb.classes_
print mnb.coef_[0]
print mnb.coef_[1]
[out]:
['bs' 'pt']
[-5.55682806 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -5.55682806
-4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088 -4.86368088
-4.1705337 -5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -5.55682806 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.45821577 -4.86368088
-4.86368088 -4.45821577 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -4.1705337 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -4.45821577 -4.86368088 -4.86368088]
Traceback (most recent call last):
  File "test.py", line 24, in <module>
    print mnb.coef_[1]
IndexError: index 1 is out of bounds for axis 0 with size 1
So coef_ really has only one row for the two-class problem. Let's do some diagnostics:
print mnb.feature_count_
print mnb.coef_[0]
[out]:
[[ 1. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 1.
1. 1. 2. 2. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 0. 2. 1.
1. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0.
0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 0. 1. 0.
1. 2. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 1.
0. 1. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1.
1. 0. 0. 1. 0. 0. 0. 4. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0.
0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0.
0. 0. 1. 0. 0. 1. 0. 0. 0. 0.]
[ 0. 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 1. 3. 0. 1. 0. 1. 0.
0. 0. 1. 2. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 0. 0.
0. 0. 0. 2. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 0. 1. 1.
1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0. 1. 0. 1.
0. 0. 1. 1. 2. 1. 1. 2. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0.
1. 0. 1. 1. 1. 0. 0. 0. 1. 1. 0. 1. 0. 1. 0. 1. 0. 0.
0. 1. 1. 0. 1. 1. 1. 3. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1.
1. 1. 0. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 1. 1. 1.
1. 1. 0. 1. 1. 0. 1. 2. 1. 1.]]
[-5.55682806 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -5.55682806
-4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088 -4.86368088
-4.1705337 -5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.45821577 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -5.55682806 -5.55682806 -5.55682806
-5.55682806 -5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.45821577 -4.86368088
-4.86368088 -4.45821577 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -5.55682806 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -5.55682806
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-5.55682806 -4.86368088 -5.55682806 -4.86368088 -5.55682806 -5.55682806
-5.55682806 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-4.86368088 -4.1705337 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088
-5.55682806 -5.55682806 -4.86368088 -4.86368088 -4.86368088 -4.86368088
-4.86368088 -4.86368088 -5.55682806 -4.86368088 -4.86368088 -5.55682806
-4.86368088 -4.45821577 -4.86368088 -4.86368088]
So the per-class counts are still there in feature_count_, while coef_ has been collapsed to a single row for the binary case (presumably to save memory). Let's pair each feature with that coefficient row and its per-class counts:
index = 0
coef_features_c1_c2 = []
for feat, c1, c2 in zip(word_vectorizer.get_feature_names(), mnb.feature_count_[0], mnb.feature_count_[1]):
    coef_features_c1_c2.append((mnb.coef_[0][index], feat, c1, c2))
    index += 1

for i in sorted(coef_features_c1_c2):
    print i
[out]:
(-5.5568280616995374, u'acuerdo', 1.0, 0.0)
(-5.5568280616995374, u'al', 1.0, 0.0)
(-5.5568280616995374, u'alex', 1.0, 0.0)
(-5.5568280616995374, u'algo', 1.0, 0.0)
(-5.5568280616995374, u'andaba', 1.0, 0.0)
(-5.5568280616995374, u'andrea', 1.0, 0.0)
(-5.5568280616995374, u'bien', 1.0, 0.0)
(-5.5568280616995374, u'buscando', 1.0, 0.0)
(-5.5568280616995374, u'como', 1.0, 0.0)
(-5.5568280616995374, u'con', 1.0, 0.0)
(-5.5568280616995374, u'conseguido', 1.0, 0.0)
(-5.5568280616995374, u'distancia', 1.0, 0.0)
(-5.5568280616995374, u'doprinese', 1.0, 0.0)
(-5.5568280616995374, u'es', 2.0, 0.0)
(-5.5568280616995374, u'est\xe1', 1.0, 0.0)
(-5.5568280616995374, u'eulex', 1.0, 0.0)
(-5.5568280616995374, u'excusa', 1.0, 0.0)
(-5.5568280616995374, u'fama', 1.0, 0.0)
(-5.5568280616995374, u'guasch', 1.0, 0.0)
(-5.5568280616995374, u'ha', 1.0, 0.0)
(-5.5568280616995374, u'incident', 1.0, 0.0)
(-5.5568280616995374, u'ispit', 1.0, 0.0)
(-5.5568280616995374, u'istragu', 1.0, 0.0)
(-5.5568280616995374, u'izbijanju', 1.0, 0.0)
(-5.5568280616995374, u'ja\u010danju', 1.0, 0.0)
(-5.5568280616995374, u'je', 1.0, 0.0)
(-5.5568280616995374, u'jedan', 1.0, 0.0)
(-5.5568280616995374, u'jo\u0161', 1.0, 0.0)
(-5.5568280616995374, u'kapaciteta', 1.0, 0.0)
(-5.5568280616995374, u'kosova', 1.0, 0.0)
(-5.5568280616995374, u'la', 1.0, 0.0)
(-5.5568280616995374, u'lequio', 1.0, 0.0)
(-5.5568280616995374, u'llevar', 1.0, 0.0)
(-5.5568280616995374, u'lo', 2.0, 0.0)
(-5.5568280616995374, u'misije', 1.0, 0.0)
(-5.5568280616995374, u'muy', 1.0, 0.0)
(-5.5568280616995374, u'm\xe1s', 1.0, 0.0)
(-5.5568280616995374, u'na', 1.0, 0.0)
(-5.5568280616995374, u'nada', 1.0, 0.0)
(-5.5568280616995374, u'nasilja', 1.0, 0.0)
(-5.5568280616995374, u'no', 1.0, 0.0)
(-5.5568280616995374, u'obaviti', 1.0, 0.0)
(-5.5568280616995374, u'obe\u0107ao', 1.0, 0.0)
(-5.5568280616995374, u'parecer', 1.0, 0.0)
(-5.5568280616995374, u'pone', 1.0, 0.0)
(-5.5568280616995374, u'por', 1.0, 0.0)
(-5.5568280616995374, u'po\u0161to', 1.0, 0.0)
(-5.5568280616995374, u'prava', 1.0, 0.0)
(-5.5568280616995374, u'predstavlja', 1.0, 0.0)
(-5.5568280616995374, u'pro\u0161losedmi\u010dnom', 1.0, 0.0)
(-5.5568280616995374, u'relaci\xf3n', 1.0, 0.0)
(-5.5568280616995374, u'sjeveru', 1.0, 0.0)
(-5.5568280616995374, u'taj', 1.0, 0.0)
(-5.5568280616995374, u'una', 1.0, 0.0)
(-5.5568280616995374, u'visto', 1.0, 0.0)
(-5.5568280616995374, u'vladavine', 1.0, 0.0)
(-5.5568280616995374, u'ya', 1.0, 0.0)
(-5.5568280616995374, u'\u0107e', 1.0, 0.0)
(-4.863680881139592, u'aj', 0.0, 1.0)
(-4.863680881139592, u'ajudou', 0.0, 1.0)
(-4.863680881139592, u'alpsk\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'alpy', 0.0, 1.0)
(-4.863680881139592, u'ao', 0.0, 1.0)
(-4.863680881139592, u'apresenta', 0.0, 1.0)
(-4.863680881139592, u'bl\xedzko', 0.0, 1.0)
(-4.863680881139592, u'come\xe7o', 0.0, 1.0)
(-4.863680881139592, u'da', 2.0, 1.0)
(-4.863680881139592, u'decepcionantes', 0.0, 1.0)
(-4.863680881139592, u'deti', 0.0, 1.0)
(-4.863680881139592, u'dificuldades', 0.0, 1.0)
(-4.863680881139592, u'dif\xedcil', 1.0, 1.0)
(-4.863680881139592, u'do', 0.0, 1.0)
(-4.863680881139592, u'druh', 0.0, 1.0)
(-4.863680881139592, u'd\xe1', 0.0, 1.0)
(-4.863680881139592, u'ela', 0.0, 1.0)
(-4.863680881139592, u'encontrar', 0.0, 1.0)
(-4.863680881139592, u'enfrentar', 0.0, 1.0)
(-4.863680881139592, u'for\xe7as', 0.0, 1.0)
(-4.863680881139592, u'furiosa', 0.0, 1.0)
(-4.863680881139592, u'golf', 0.0, 1.0)
(-4.863680881139592, u'golfistami', 0.0, 1.0)
(-4.863680881139592, u'golfov\xfdch', 0.0, 1.0)
(-4.863680881139592, u'hotelmi', 0.0, 1.0)
(-4.863680881139592, u'hra\u0165', 0.0, 1.0)
(-4.863680881139592, u'ide', 0.0, 1.0)
(-4.863680881139592, u'ihr\xedsk', 0.0, 1.0)
(-4.863680881139592, u'intranspon\xedveis', 0.0, 1.0)
(-4.863680881139592, u'in\xedcio', 0.0, 1.0)
(-4.863680881139592, u'in\xfd', 0.0, 1.0)
(-4.863680881139592, u'kde', 0.0, 1.0)
(-4.863680881139592, u'kombin\xe1cie', 0.0, 1.0)
(-4.863680881139592, u'komplex', 0.0, 1.0)
(-4.863680881139592, u'kon\u010diarmi', 0.0, 1.0)
(-4.863680881139592, u'lado', 0.0, 1.0)
(-4.863680881139592, u'lete', 0.0, 1.0)
(-4.863680881139592, u'longo', 0.0, 1.0)
(-4.863680881139592, u'ly\u017eova\u0165', 0.0, 1.0)
(-4.863680881139592, u'man\u017eelky', 0.0, 1.0)
(-4.863680881139592, u'mas', 0.0, 1.0)
(-4.863680881139592, u'mesmo', 0.0, 1.0)
(-4.863680881139592, u'meu', 0.0, 1.0)
(-4.863680881139592, u'minha', 0.0, 1.0)
(-4.863680881139592, u'mo\u017enos\u0165ami', 0.0, 1.0)
(-4.863680881139592, u'm\xe3e', 0.0, 1.0)
(-4.863680881139592, u'nad\u0161en\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'negativas', 0.0, 1.0)
(-4.863680881139592, u'nie', 0.0, 1.0)
(-4.863680881139592, u'nieko\u013ek\xfdch', 0.0, 1.0)
(-4.863680881139592, u'para', 0.0, 1.0)
(-4.863680881139592, u'parecem', 0.0, 1.0)
(-4.863680881139592, u'pod', 0.0, 1.0)
(-4.863680881139592, u'pon\xfakaj\xfa', 0.0, 1.0)
(-4.863680881139592, u'potrebuj\xfa', 0.0, 1.0)
(-4.863680881139592, u'pri', 0.0, 1.0)
(-4.863680881139592, u'prova\xe7\xf5es', 0.0, 1.0)
(-4.863680881139592, u'punham', 0.0, 1.0)
(-4.863680881139592, u'qual', 0.0, 1.0)
(-4.863680881139592, u'qualquer', 0.0, 1.0)
(-4.863680881139592, u'quem', 0.0, 1.0)
(-4.863680881139592, u'rak\xfaske', 0.0, 1.0)
(-4.863680881139592, u'rezortov', 0.0, 1.0)
(-4.863680881139592, u'sa', 0.0, 1.0)
(-4.863680881139592, u'sebe', 0.0, 1.0)
(-4.863680881139592, u'sempre', 0.0, 1.0)
(-4.863680881139592, u'situa\xe7\xf5es', 0.0, 1.0)
(-4.863680881139592, u'spojen\xfdch', 0.0, 1.0)
(-4.863680881139592, u'suplantar', 0.0, 1.0)
(-4.863680881139592, u's\xfa', 0.0, 1.0)
(-4.863680881139592, u'tak', 0.0, 1.0)
(-4.863680881139592, u'talianske', 0.0, 1.0)
(-4.863680881139592, u'teve', 0.0, 1.0)
(-4.863680881139592, u'tive', 0.0, 1.0)
(-4.863680881139592, u'todas', 0.0, 1.0)
(-4.863680881139592, u'tr\xe1venia', 0.0, 1.0)
(-4.863680881139592, u've\u013ek\xfd', 0.0, 1.0)
(-4.863680881139592, u'vida', 0.0, 1.0)
(-4.863680881139592, u'vo', 0.0, 1.0)
(-4.863680881139592, u'vo\u013en\xe9ho', 0.0, 1.0)
(-4.863680881139592, u'vysok\xfdmi', 0.0, 1.0)
(-4.863680881139592, u'vy\u017eitia', 0.0, 1.0)
(-4.863680881139592, u'v\xe4\u010d\u0161ine', 0.0, 1.0)
(-4.863680881139592, u'v\u017edy', 0.0, 1.0)
(-4.863680881139592, u'zauj\xedmav\xe9', 0.0, 1.0)
(-4.863680881139592, u'zime', 0.0, 1.0)
(-4.863680881139592, u'\u010dasu', 0.0, 1.0)
(-4.863680881139592, u'\u010fal\u0161\xedmi', 0.0, 1.0)
(-4.863680881139592, u'\u0161vaj\u010diarske', 0.0, 1.0)
(-4.4582157730314274, u'de', 2.0, 2.0)
(-4.4582157730314274, u'foi', 0.0, 2.0)
(-4.4582157730314274, u'mais', 0.0, 2.0)
(-4.4582157730314274, u'me', 0.0, 2.0)
(-4.4582157730314274, u'\u010di', 0.0, 2.0)
(-4.1705337005796466, u'as', 0.0, 3.0)
(-4.1705337005796466, u'que', 4.0, 3.0)
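Before drawing conclusions, a quick sanity check of the flattening hypothesis (my addition; that coef_ mirrors feature_log_prob_[1] in older scikit-learn versions is an assumption based on the behaviour observed above):

print mnb.feature_log_prob_.shape   # (2, n_features): one log-prob row per class
print mnb.coef_.shape               # (1, n_features): collapsed in the binary case
# If the hypothesis holds, coef_[0] is just the log-probs of classes_[1] ('pt'):
print np.allclose(mnb.coef_[0], mnb.feature_log_prob_[1])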
Now we can see a pattern: the highest coefficients favor one class ('pt') and the lowest favor the other ('bs'), so you can simply take the two tails of the sorted list:
import codecs, re, time
from itertools import chain
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

trainfile = 'train.txt'

# Vectorizing data.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
trainset = word_vectorizer.fit_transform(codecs.open(trainfile,'r','utf8'))
tags = ['bs','pt','bs','pt']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)

def most_informative_feature_for_binary_classification(vectorizer, classifier, n=10):
    class_labels = classifier.classes_
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(classifier.coef_[0], feature_names))
    topn_class1 = coefs_with_fns[:n]   # lowest coefficients -> first class
    topn_class2 = coefs_with_fns[-n:]  # highest coefficients -> second class

    for coef, feat in topn_class1:
        print class_labels[0], coef, feat

    print

    for coef, feat in reversed(topn_class2):
        print class_labels[1], coef, feat

most_informative_feature_for_binary_classification(word_vectorizer, mnb)
[out]:
bs -5.5568280617 acuerdo
bs -5.5568280617 al
bs -5.5568280617 alex
bs -5.5568280617 algo
bs -5.5568280617 andaba
bs -5.5568280617 andrea
bs -5.5568280617 bien
bs -5.5568280617 buscando
bs -5.5568280617 como
bs -5.5568280617 con
pt -4.17053370058 que
pt -4.17053370058 as
pt -4.45821577303 či
pt -4.45821577303 me
pt -4.45821577303 mais
pt -4.45821577303 foi
pt -4.45821577303 de
pt -4.86368088114 švajčiarske
pt -4.86368088114 ďalšími
pt -4.86368088114 času
Actually, if you read @larsmans' comment carefully, he already gives the hint about the binary-class coefficients in How to get most informative features for scikit-learn classifiers?

Basically you need:
def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]
    for coef, feat in topn:
        print classlabel, feat, coef
classifier.classes_ lists the class labels the classifier was trained on; list(classifier.classes_).index(classlabel) turns a label into its row index into coef_.

vectorizer.get_feature_names() returns the feature (token) names, in the same column order as coef_.

sorted(zip(classifier.coef_[labelid], feature_names))[-n:] pairs each coefficient of the chosen class with its feature name, sorts the pairs in ascending order of coefficient, and keeps the n largest.
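For intuition, here is the same sort-and-slice idiom on invented toy values:

coefs = [-4.2, -5.6, -3.9]
names = ['foo', 'bar', 'baz']
print sorted(zip(coefs, names))[-2:]
# [(-4.2, 'foo'), (-3.9, 'baz')] -> the 2 largest coefficients, in ascending order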
I'm going to use a simple example from https://github.com/alvations/bayesline. Input file train.txt:
$ echo """Pošto je EULEX obećao da će obaviti istragu o prošlosedmičnom izbijanju nasilja na sjeveru Kosova, taj incident predstavlja još jedan ispit kapaciteta misije da doprinese jačanju vladavine prava.
> De todas as provações que teve de suplantar ao longo da vida, qual foi a mais difícil? O início. Qualquer começo apresenta dificuldades que parecem intransponíveis. Mas tive sempre a minha mãe do meu lado. Foi ela quem me ajudou a encontrar forças para enfrentar as situações mais decepcionantes, negativas, as que me punham mesmo furiosa.
> Al parecer, Andrea Guasch pone que una relación a distancia es muy difícil de llevar como excusa. Algo con lo que, por lo visto, Alex Lequio no está nada de acuerdo. ¿O es que más bien ya ha conseguido la fama que andaba buscando?
> Vo väčšine golfových rezortov ide o veľký komplex niekoľkých ihrísk blízko pri sebe spojených s hotelmi a ďalšími možnosťami trávenia voľného času – nie vždy sú manželky či deti nadšenými golfistami, a tak potrebujú iný druh vyžitia. Zaujímavé kombinácie ponúkajú aj rakúske, švajčiarske či talianske Alpy, kde sa dá v zime lyžovať a v lete hrať golf pod vysokými alpskými končiarmi.""" > train.txt
Code:
import codecs, re, time
from itertools import chain
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

trainfile = 'train.txt'

# Vectorizing data.
train = []
word_vectorizer = CountVectorizer(analyzer='word')
trainset = word_vectorizer.fit_transform(codecs.open(trainfile,'r','utf8'))
tags = ['bs','pt','es','sr']

# Training NB
mnb = MultinomialNB()
mnb.fit(trainset, tags)

def most_informative_feature_for_class(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = vectorizer.get_feature_names()
    topn = sorted(zip(classifier.coef_[labelid], feature_names))[-n:]
    for coef, feat in topn:
        print classlabel, feat, coef

most_informative_feature_for_class(word_vectorizer, mnb, 'bs')
print
most_informative_feature_for_class(word_vectorizer, mnb, 'pt')
[out]:
bs obećao -4.50534985071
bs pošto -4.50534985071
bs prava -4.50534985071
bs predstavlja -4.50534985071
bs prošlosedmičnom -4.50534985071
bs sjeveru -4.50534985071
bs taj -4.50534985071
bs vladavine -4.50534985071
bs će -4.50534985071
bs da -4.0998847426
pt teve -4.63472898823
pt tive -4.63472898823
pt todas -4.63472898823
pt vida -4.63472898823
pt de -4.22926388012
pt foi -4.22926388012
pt mais -4.22926388012
pt me -4.22926388012
pt as -3.94158180767
pt que -3.94158180767
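As a small follow-up sketch of my own (not from the original thread): numpy's argsort gets the same top-n indices without building the fully sorted pair list, reusing the word_vectorizer and mnb from above:

def most_informative_feature_for_class_np(vectorizer, classifier, classlabel, n=10):
    labelid = list(classifier.classes_).index(classlabel)
    feature_names = np.asarray(vectorizer.get_feature_names())
    topn = np.argsort(classifier.coef_[labelid])[-n:]   # indices of the n largest coefficients
    for idx in topn:
        print classlabel, feature_names[idx], classifier.coef_[labelid][idx]

most_informative_feature_for_class_np(word_vectorizer, mnb, 'pt')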