VarianceThreshold
In [2]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import pandas as pd
In [20]:
# Synthetic two-feature matrix (100 samples) drawn from a single seeded
# generator so the values are reproducible on every run:
#   column 0 -> rng.rand  (uniform on [0, 1))
#   column 1 -> rng.randn (standard normal)
rng = np.random.RandomState(0)
data = np.column_stack((rng.rand(100), rng.randn(100)))
In [21]:
# Preview the first five samples (all columns).
data[:5]
Out[21]:
array([[ 0.5488135 , -1.16514984],
[ 0.71518937, 0.90082649],
[ 0.60276338, 0.46566244],
[ 0.54488318, -1.53624369],
[ 0.4236548 , 1.48825219]])
In [24]:
# Per-feature (column-wise) variance of the synthetic data.
data.var(axis=0)
Out[24]:
array([ 0.08311782, 0.99276922])
In [27]:
# The threshold has the form p * (1 - p) with p = 0.8, i.e. the variance of a
# Bernoulli variable with success probability 0.8 (about 0.16).
vt = VarianceThreshold(threshold=0.8 * (1 - 0.8))
In [28]:
# `data_uni` was never defined in this notebook; it only existed as leftover
# kernel state, so a fresh "Restart & Run All" failed here with NameError.
# NOTE(review): presumably it held the uniform feature on its own -- confirm.
# Rebuilding it from column 0 of `data` (variance ~0.083, below the 0.16
# threshold) reproduces the recorded outcome: no feature passes the threshold,
# so fit() raises ValueError. The exception is the point of this cell.
data_uni = data[:, [0]]
vt.fit_transform(data_uni)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-28-4b88c5698496> in <module>()
----> 1 vt.fit_transform(data_uni)
/Users/knt/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
492 if y is None:
493 # fit method of arity 1 (unsupervised transformation)
--> 494 return self.fit(X, **fit_params).transform(X)
495 else:
496 # fit method of arity 2 (supervised transformation)
/Users/knt/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/sklearn/feature_selection/variance_threshold.py in fit(self, X, y)
73 if X.shape[0] == 1:
74 msg += " (X contains only one sample)"
---> 75 raise ValueError(msg.format(self.threshold))
76
77 return self
ValueError: No feature in X meets the variance threshold 0.16000
In [ ]:
In [40]:
from sklearn.feature_selection import VarianceThreshold

# Toy matrix of 6 samples x 3 binary (0/1) features.
X = np.array(
    [
        [0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 1],
        [0, 1, 0],
        [0, 1, 1],
    ]
)

# Threshold of the form p * (1 - p) with p = 0.8 (about 0.16).
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))

# Display both the input matrix and the configured selector.
X, sel
Out[40]:
(array([[0, 0, 1],
[0, 1, 0],
[1, 0, 0],
[0, 1, 1],
[0, 1, 0],
[0, 1, 1]]), VarianceThreshold(threshold=0.15999999999999998))
In [42]:
# Fit the selector on X and return X reduced to the features it keeps.
sel.fit_transform(X)
Out[42]:
array([[0, 1],
[1, 0],
[0, 0],
[1, 1],
[1, 0],
[1, 1]])
In [47]:
# Compare the column-wise variance computed directly with NumPy against the
# variances the fitted selector stored.
np.var(X, axis=0), sel.variances_
Out[47]:
(array([ 0.13888889, 0.22222222, 0.25 ]),
array([ 0.13888889, 0.22222222, 0.25 ]))
In [45]:
# Boolean mask: True for each feature whose fitted variance is strictly
# greater than the configured threshold.
sel.variances_ > sel.threshold
Out[45]:
array([False, True, True], dtype=bool)
In [ ]: