分组对象与apply函数
- 函数apply即可用于分组对象,也可以作用于dataframe数据
- Groupby.apply(func)
- 需要注意axis=0和axis=1的区别
- np.sum, axis=0 相当于计算每列的总和
- np.sum, axis=1 相当于计算每行的总和
import pandas as pd
import numpy as np
import os
os.getcwd()
'D:\\Jupyter\\notebook\\Python数据清洗实战\\数据清洗之数据统计'
os.chdir('D:\\Jupyter\\notebook\\Python数据清洗实战\\数据')
df = pd.read_csv('online_order.csv', encoding='gbk', dtype={'customer':str, 'order':str})
df.head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border=”1″ class=”dataframe”>
<thead>
<tr style="text-align: right;">
<th></th>
<th>customer</th>
<th>order</th>
<th>total_items</th>
<th>discount%</th>
<th>weekday</th>
<th>hour</th>
<th>Food%</th>
<th>Fresh%</th>
<th>Drinks%</th>
<th>Home%</th>
<th>Beauty%</th>
<th>Health%</th>
<th>Baby%</th>
<th>Pets%</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>0</td>
<td>0</td>
<td>45</td>
<td>23.03</td>
<td>4</td>
<td>13</td>
<td>9.46</td>
<td>87.06</td>
<td>3.48</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>1</th>
<td>0</td>
<td>1</td>
<td>38</td>
<td>1.22</td>
<td>5</td>
<td>13</td>
<td>15.87</td>
<td>75.80</td>
<td>6.22</td>
<td>2.12</td>
<td>0.00</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>2</th>
<td>0</td>
<td>2</td>
<td>51</td>
<td>18.08</td>
<td>4</td>
<td>13</td>
<td>16.88</td>
<td>56.75</td>
<td>3.37</td>
<td>16.48</td>
<td>6.53</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>3</th>
<td>1</td>
<td>3</td>
<td>57</td>
<td>16.51</td>
<td>1</td>
<td>12</td>
<td>28.81</td>
<td>35.99</td>
<td>11.78</td>
<td>4.62</td>
<td>2.87</td>
<td>15.92</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>4</th>
<td>1</td>
<td>4</td>
<td>53</td>
<td>18.31</td>
<td>2</td>
<td>11</td>
<td>24.13</td>
<td>60.38</td>
<td>7.78</td>
<td>7.72</td>
<td>0.00</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
</tbody>
</table>
</div>
grouped = df.groupby('weekday')
# 只可传入一个统计参数
# agg可传入多个
# grouped.apply([np.mean, np.sum])
grouped.apply(np.mean)[['total_items', 'discount%', 'weekday']]
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border=”1″ class=”dataframe”>
<thead>
<tr style="text-align: right;">
<th></th>
<th>total_items</th>
<th>discount%</th>
<th>weekday</th>
</tr>
<tr>
<th>weekday</th>
<th></th>
<th></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th>1</th>
<td>30.662177</td>
<td>8.580705</td>
<td>1.0</td>
</tr>
<tr>
<th>2</th>
<td>31.868612</td>
<td>8.638014</td>
<td>2.0</td>
</tr>
<tr>
<th>3</th>
<td>31.869796</td>
<td>7.794507</td>
<td>3.0</td>
</tr>
<tr>
<th>4</th>
<td>32.251899</td>
<td>8.068155</td>
<td>4.0</td>
</tr>
<tr>
<th>5</th>
<td>31.406619</td>
<td>9.159031</td>
<td>5.0</td>
</tr>
<tr>
<th>6</th>
<td>32.154814</td>
<td>8.414258</td>
<td>6.0</td>
</tr>
<tr>
<th>7</th>
<td>32.373837</td>
<td>8.710171</td>
<td>7.0</td>
</tr>
</tbody>
</table>
</div>
df.columns
Index(['customer', 'order', 'total_items', 'discount%', 'weekday', 'hour',
'Food%', 'Fresh%', 'Drinks%', 'Home%', 'Beauty%', 'Health%', 'Baby%',
'Pets%'],
dtype='object')
var = ['Food%', 'Fresh%', 'Drinks%', 'Home%', 'Beauty%', 'Health%', 'Baby%',
'Pets%']
df[var].head(5)
<div>
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
<table border=”1″ class=”dataframe”>
<thead>
<tr style="text-align: right;">
<th></th>
<th>Food%</th>
<th>Fresh%</th>
<th>Drinks%</th>
<th>Home%</th>
<th>Beauty%</th>
<th>Health%</th>
<th>Baby%</th>
<th>Pets%</th>
</tr>
</thead>
<tbody>
<tr>
<th>0</th>
<td>9.46</td>
<td>87.06</td>
<td>3.48</td>
<td>0.00</td>
<td>0.00</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>1</th>
<td>15.87</td>
<td>75.80</td>
<td>6.22</td>
<td>2.12</td>
<td>0.00</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>2</th>
<td>16.88</td>
<td>56.75</td>
<td>3.37</td>
<td>16.48</td>
<td>6.53</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>3</th>
<td>28.81</td>
<td>35.99</td>
<td>11.78</td>
<td>4.62</td>
<td>2.87</td>
<td>15.92</td>
<td>0.0</td>
<td>0.0</td>
</tr>
<tr>
<th>4</th>
<td>24.13</td>
<td>60.38</td>
<td>7.78</td>
<td>7.72</td>
<td>0.00</td>
<td>0.00</td>
<td>0.0</td>
<td>0.0</td>
</tr>
</tbody>
</table>
</div>
# 计算每个变量的总和
df[var].apply(np.sum, axis=0)
Food% 706812.19
Fresh% 606818.38
Drinks% 700477.06
Home% 406187.25
Beauty% 176788.48
Health% 33988.76
Baby% 332884.34
Pets% 31292.61
dtype: float64
# 对每一行求和
df[var].apply(np.sum, axis=1).head(5)
0 100.00
1 100.01
2 100.01
3 99.99
4 100.01
dtype: float64
# Food% - Fresh%
df[var].apply(lambda x: x[0] - x[1], axis=1).head(5)
0 -77.60
1 -59.93
2 -39.87
3 -7.18
4 -36.25
dtype: float64
正文完