In [0]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Newsgroups used in this experiment. Defined once so the train and test
# splits always request exactly the same set of categories.
CATEGORIES = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Load the train and test subsets restricted to CATEGORIES. Both are shuffled
# with the same fixed seed so document order is stable across runs.
twenty_train = fetch_20newsgroups(
    subset="train",
    categories=CATEGORIES,
    shuffle=True,
    random_state=42,
)
twenty_test = fetch_20newsgroups(
    subset="test",
    categories=CATEGORIES,
    shuffle=True,
    random_state=42,
)

# Text classification pipeline: raw documents -> token counts -> TF-IDF
# weighting -> linear classifier fitted with SGD (hinge loss, L2 penalty).
text_classify_pipeline = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    # Seed the classifier: SGD shuffles the training data between epochs, so
    # without a fixed random_state the cross-validation scores (and therefore
    # the selected hyper-parameters) can differ from one run to the next.
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42)),
])

# Hyper-parameter grid for the search. Keys follow the "<step>__<param>"
# convention, where <step> is the name given to a stage of the pipeline.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),        # with / without IDF re-weighting
    clf__alpha=(0.01, 0.001),            # candidate values for the classifier's alpha
)

# Exhaustive search over `parameters` with 5-fold cross-validation, using all
# available CPU cores (n_jobs=-1). fit() returns the fitted search object, so
# construction and fitting are chained into a single assignment.
gs_clf = GridSearchCV(
    estimator=text_classify_pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
).fit(twenty_train.data, twenty_train.target)



In [0]:
# Report the best score recorded by the 5-fold grid search, then the winning
# value of every tuned hyper-parameter, listed in alphabetical order by name.
print(gs_clf.best_score_)
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s : %r" % (param_name, best_value))

0.9672131147540983
clf__alpha : 0.001
tfidf__use_idf : True
vect__ngram_range : (1, 1)


In [0]:
# Build a fresh pipeline with the same stages as the one that was searched and
# configure it with the best hyper-parameters found by the grid search.
final_pipeline = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    # Fixed seed so the held-out score below is reproducible across runs.
    ('clf', SGDClassifier(loss='hinge', penalty='l2', random_state=42)),
])

# Take the winning settings directly from the search result rather than
# copying them by hand from printed output: hand-copied values silently go
# stale whenever the grid or the data changes.
final_pipeline.set_params(**gs_clf.best_params_)

# Train on the full training split (fit() mutates and returns the same object,
# so no re-assignment is needed).
final_pipeline.fit(twenty_train.data, twenty_train.target)

# Score on the held-out test split; left as the last expression so the
# notebook displays it.
final_pipeline.score(twenty_test.data, twenty_test.target)



0.9107856191744341

In [0]:
# Shape of the test-set label array, i.e. how many test documents were scored.
twenty_test["target"].shape

(1502,)