{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# equals\n",
"## DataFrameが等しいことを確認する\n",
"\n",
"2つのDataFrameを比較して正しいことを確認する機会があった時のメモ"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## NAを含むDataFrameを作成"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 4 | \n",
" NaN | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 4 | \n",
" NaN | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" 2 | \n",
" 4 | \n",
" NaN | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"0 1 4 NaN 1\n",
"1 4 4 NaN 4\n",
"2 2 4 NaN 3"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Seed the legacy global RNG so the sample frame is reproducible on every run\n",
"np.random.seed(0)\n",
"# randint's upper bound is exclusive, so (1, 5) draws from 1..4 like the deprecated\n",
"# random_integers(1, 4); four labels are given to match the four generated columns\n",
"df = pd.DataFrame(np.random.randint(1, 5, size=(3, 4)), columns=list(\"abcd\"))\n",
"# Overwrite one column with NaN so the comparisons below have missing values to handle\n",
"df[\"c\"] = np.nan\n",
"other = df.copy()\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 各要素が等しいか, DataFrame同士が等しいかを確認"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" True | \n",
" True | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" True | \n",
" True | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" True | \n",
" True | \n",
" False | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"0 True True False True\n",
"1 True True False True\n",
"2 True True False True"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df == other"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(False, True)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.nan == np.nan, np.nan != np.nan"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.equals(other)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"NA同士は要素単位の比較では等しくない(SQLにおけるNULLと同様)が、`equals`ではDataFrameとして等しいと判定される"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 等しくない場合、どこが等しくないかを確認する\n",
"\n",
"- NAを特定の文字列にし、要素の比較をしたときに等しくなるようにする\n",
"- DataFrame同士が等しくないようにするため、otherを変更する"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 4 | \n",
" 100 | \n",
" NA String | \n",
" 1 | \n",
"
\n",
" \n",
" 1 | \n",
" 4 | \n",
" 4 | \n",
" NA String | \n",
" 4 | \n",
"
\n",
" \n",
" 2 | \n",
" 4 | \n",
" 4 | \n",
" NA String | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"0 4 100 NA String 1\n",
"1 4 4 NA String 4\n",
"2 4 4 NA String 3"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Swap every NaN for a sentinel string in both frames so those cells compare equal element-wise\n",
"NA_TOKEN = \"NA String\"\n",
"df, other = df.fillna(NA_TOKEN), other.fillna(NA_TOKEN)\n",
"# Deliberately make `other` differ: set column a to a constant and change the cell at row 0, column position 1\n",
"other[\"a\"] = 4\n",
"other.iloc[0, 1] = 100\n",
"other"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" False | \n",
" False | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" False | \n",
" True | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"0 False False True True\n",
"1 True True True True\n",
"2 False True True True"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Method form of the == operator (element-wise comparison)\n",
"eq = df.eq(other)\n",
"eq"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.equals(other)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"- NAであった要素は等しくなっている\n",
"- 変更をしたため、DataFrameとしては等しくない"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 等しくないColumnとIndexの特定およびどれくらい等しいか"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"要素比較結果のDataFrameに対してallをColumnとIndex方向の両方に適用して特定する"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0\n",
"0 False\n",
"1 True\n",
"2 False"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Row-wise check: True only where every column of that row matched\n",
"eq.all(axis=1).to_frame()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" False | \n",
" False | \n",
" True | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"0 False False True True"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column-wise check, transposed into a single row for display\n",
"eq.all().to_frame().T"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
" \n",
" \n",
" | \n",
" a | \n",
" b | \n",
" c | \n",
" d | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1.000000 | \n",
" 2.000000 | \n",
" 3 | \n",
" 3 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.333333 | \n",
" 0.666667 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" a b c d\n",
"0 1.000000 2.000000 3 3\n",
"1 0.333333 0.666667 1 1"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-column matches: row 0 is the number of matching rows, row 1 that count divided by len(df)\n",
"match_counts = pd.DataFrame(eq.sum()).T\n",
"pd.concat([match_counts, match_counts / len(df)], ignore_index=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"None\n",
" a b c d\n",
"0 33.33% 66.67% 100.00% 100.00%\n"
]
}
],
"source": [
"# Print the current float formatter first, then apply a percent formatter only inside the with-block\n",
"print(pd.get_option(\"display.float_format\"))\n",
"match_pct = pd.DataFrame(eq.sum()).T / len(df) * 100\n",
"with pd.option_context(\"display.float_format\", \"{:.2f}%\".format):\n",
"    print(match_pct)"
]
}
],
"metadata": {
"hide_input": false,
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"toc": {
"toc_cell": false,
"toc_number_sections": true,
"toc_threshold": 6,
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 0
}