VarianceThreshold

In [2]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import pandas as pd

In [20]:
# Toy matrix with two columns built from a fixed seed. The uniform draw
# (rand) happens before the normal draw (randn), so the random stream is
# consumed in the same order on every run.
rng = np.random.RandomState(0)
data = np.column_stack([rng.rand(100), rng.randn(100)])
In [21]:
# Peek at the first five rows (all columns) of the toy matrix.
data[:5]
Out[21]:
array([[ 0.5488135 , -1.16514984],
       [ 0.71518937,  0.90082649],
       [ 0.60276338,  0.46566244],
       [ 0.54488318, -1.53624369],
       [ 0.4236548 ,  1.48825219]])
In [24]:
# Per-column variance; these are the values the selector's threshold is
# compared against.
data.var(axis=0)
Out[24]:
array([ 0.08311782,  0.99276922])
In [27]:
# Threshold written as p * (1 - p) with p = 0.8, i.e. roughly 0.16.
vt = VarianceThreshold(threshold=0.8 * (1 - 0.8))
In [28]:
# NOTE(review): `data_uni` was not defined in any visible cell, so this call
# only worked because of leftover kernel state and fails with NameError on
# Restart & Run All. Assumption to confirm: it was the uniform column of
# `data` on its own. That column's variance (~0.083) is below the 0.16
# threshold, so fit_transform raises "No feature in X meets the variance
# threshold", which matches the recorded traceback.
data_uni = data[:, [0]]
vt.fit_transform(data_uni)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-4b88c5698496> in <module>()
----> 1 vt.fit_transform(data_uni)

/Users/knt/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    492         if y is None:
    493             # fit method of arity 1 (unsupervised transformation)
--> 494             return self.fit(X, **fit_params).transform(X)
    495         else:
    496             # fit method of arity 2 (supervised transformation)

/Users/knt/.pyenv/versions/anaconda3-4.2.0/lib/python3.5/site-packages/sklearn/feature_selection/variance_threshold.py in fit(self, X, y)
     73             if X.shape[0] == 1:
     74                 msg += " (X contains only one sample)"
---> 75             raise ValueError(msg.format(self.threshold))
     76
     77         return self

ValueError: No feature in X meets the variance threshold 0.16000
In [ ]:

In [40]:
from sklearn.feature_selection import VarianceThreshold
# Six samples with three binary (0/1) features each.
X = np.array(
    [[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1, 1]]
)
# Threshold is p * (1 - p) with p = 0.8, which evaluates to
# 0.15999999999999998 in floating point.
sel = VarianceThreshold(threshold=0.8 * (1 - 0.8))
# Show the input matrix alongside the configured selector.
(X, sel)
Out[40]:
(array([[0, 0, 1],
        [0, 1, 0],
        [1, 0, 0],
        [0, 1, 1],
        [0, 1, 0],
        [0, 1, 1]]), VarianceThreshold(threshold=0.15999999999999998))
In [42]:
# Fit the selector on X and return X reduced to the columns it keeps.
sel.fit_transform(X)

Out[42]:
array([[0, 1],
       [1, 0],
       [0, 0],
       [1, 1],
       [1, 0],
       [1, 1]])
In [47]:
# Column-wise variance computed directly with NumPy, shown next to the
# variances stored on the fitted selector so the two can be compared.
X.var(axis=0), sel.variances_
Out[47]:
(array([ 0.13888889,  0.22222222,  0.25      ]),
 array([ 0.13888889,  0.22222222,  0.25      ]))
In [45]:
# Boolean mask: True where a feature's variance is strictly above the
# configured threshold.
sel.variances_ > sel.threshold
Out[45]:
array([False,  True,  True], dtype=bool)
In [ ]: