forked from scikit-learn/scikit-learn
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hetero_feature_union.py
181 lines (142 loc) · 6.09 KB
/
hetero_feature_union.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
=============================================
Feature Union with Heterogeneous Data Sources
=============================================
Datasets can often contain components of that require different feature
extraction and processing pipelines. This scenario might occur when:
1. Your dataset consists of heterogeneous data types (e.g. raster images and
text captions)
2. Your dataset is stored in a Pandas DataFrame and different columns
require different processing pipelines.
This example demonstrates how to use
:class:`sklearn.feature_extraction.FeatureUnion` on a dataset containing
different types of features. We use the 20-newsgroups dataset and compute
standard bag-of-words features for the subject line and body in separate
pipelines as well as ad hoc features on the body. We combine them (with
weights) using a FeatureUnion and finally train a classifier on the combined
set of features.
The choice of features is not particularly helpful, but serves to illustrate
the technique.
"""
# Author: Matt Terry <[email protected]>
#
# License: BSD 3 clause
from __future__ import print_function
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
class ItemSelector(BaseEstimator, TransformerMixin):
"""For data grouped by feature, select subset of data at a provided key.
The data is expected to be stored in a 2D data structure, where the first
index is over features and the second is over samples. i.e.
>> len(data[key]) == n_samples
Please note that this is the opposite convention to scikit-learn feature
matrixes (where the first index corresponds to sample).
ItemSelector only requires that the collection implement getitem
(data[key]). Examples include: a dict of lists, 2D numpy array, Pandas
DataFrame, numpy record array, etc.
>> data = {'a': [1, 5, 2, 5, 2, 8],
'b': [9, 4, 1, 4, 1, 3]}
>> ds = ItemSelector(key='a')
>> data['a'] == ds.transform(data)
ItemSelector is not designed to handle data grouped by sample. (e.g. a
list of dicts). If your data is structured this way, consider a
transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
Parameters
----------
key : hashable, required
The key corresponding to the desired value in a mappable.
"""
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class TextStats(BaseEstimator, TransformerMixin):
"""Extract features from each document for DictVectorizer"""
def fit(self, x, y=None):
return self
def transform(self, posts):
return [{'length': len(text),
'num_sentences': text.count('.')}
for text in posts]
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
"""Extract the subject & body from a usenet post in a single pass.
Takes a sequence of strings and produces a dict of sequences. Keys are
`subject` and `body`.
"""
def fit(self, x, y=None):
return self
def transform(self, posts):
features = np.recarray(shape=(len(posts),),
dtype=[('subject', object), ('body', object)])
for i, text in enumerate(posts):
headers, _, bod = text.partition('\n\n')
bod = strip_newsgroup_footer(bod)
bod = strip_newsgroup_quoting(bod)
features['body'][i] = bod
prefix = 'Subject:'
sub = ''
for line in headers.split('\n'):
if line.startswith(prefix):
sub = line[len(prefix):]
break
features['subject'][i] = sub
return features
pipeline = Pipeline([
# Extract the subject & body
('subjectbody', SubjectBodyExtractor()),
# Use FeatureUnion to combine the features from subject and body
('union', FeatureUnion(
transformer_list=[
# Pipeline for pulling features from the post's subject line
('subject', Pipeline([
('selector', ItemSelector(key='subject')),
('tfidf', TfidfVectorizer(min_df=50)),
])),
# Pipeline for standard bag-of-words model for body
('body_bow', Pipeline([
('selector', ItemSelector(key='body')),
('tfidf', TfidfVectorizer()),
('best', TruncatedSVD(n_components=50)),
])),
# Pipeline for pulling ad hoc features from post's body
('body_stats', Pipeline([
('selector', ItemSelector(key='body')),
('stats', TextStats()), # returns a list of dicts
('vect', DictVectorizer()), # list of dicts -> feature matrix
])),
],
# weight components in FeatureUnion
transformer_weights={
'subject': 0.8,
'body_bow': 0.5,
'body_stats': 1.0,
},
)),
# Use a SVC classifier on the combined features
('svc', SVC(kernel='linear')),
])
# limit the list of categories to make running this example faster.
categories = ['alt.atheism', 'talk.religion.misc']
train = fetch_20newsgroups(random_state=1,
subset='train',
categories=categories,
)
test = fetch_20newsgroups(random_state=1,
subset='test',
categories=categories,
)
pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)
print(classification_report(y, test.target))