Python Programming

Lecture 11 Data Combine and Group Operations

11.1 Pandas (5)

Data Wrangling: Join, Combine and Reshape
World Happiness Report (2015-2020)
Batch Loading .csv files

import os
import pandas as pd

os.chdir('happiness')
current_path = os.getcwd()
files=os.listdir()
df=[]
for filename in files:
    dot_position = filename.find('.')
    file_key = filename[:dot_position]
    t = (pd.read_csv(os.path.join(current_path,filename)), int(file_key))
    df.append(t)
os.chdir('..')

# [(..., 2015), (..., 2016), (..., 2017),..., (..., 2020)]

>>> df[0][0].shape
(138, 10)

>>> df[0][0].columns
Index(['Happiness Rank', 'Country', 'Region', 'Happiness Score', 'Economy',
       'Family', 'Health', 'Freedom', 'Trust', 'Generosity'],dtype='object')
Hierarchical Indexing

# 做一个WHR 2015的副本
>>> df15 = df[0][0].copy()
>>> df15_h = df15.set_index(["Region","Country"])
>>> df15_h.head()
                            Happiness Rank  ...  Generosity
Region         Country                      ...            
Western Europe Switzerland               1  ...     0.29678
               Iceland                   2  ...     0.43630
               Denmark                   3  ...     0.34139
               Norway                    4  ...     0.34699
North America  Canada                    5  ...     0.45811

Select Concisely


>>> df15_h.loc["North America"]
               Happiness Rank  Happiness Score  ...    Trust  Generosity
Country                                         ...                     
Canada                      5            7.427  ...  0.32957     0.45811
United States              15            7.119  ...  0.15890     0.40105

>>> df15_h.loc["North America"].loc["Canada"]
Happiness Rank     5.00000
Happiness Score    7.42700
Economy            1.32629
Family             1.32261
Health             0.90563
Freedom            0.63297
Trust              0.32957
Generosity         0.45811
Name: Canada, dtype: float64

Rearrange


# 重置index
>>> df15_h.reset_index(0)
                     Region  Happiness Rank  ...    Trust  Generosity
Country                                      ...                     
Switzerland  Western Europe               1  ...  0.41978     0.29678
Iceland      Western Europe               2  ...  0.14145     0.43630
Denmark      Western Europe               3  ...  0.48357     0.34139
...

>>> df15_h.reset_index(1)
                    Country  Happiness Rank  ...    Trust  Generosity
Region                                       ...                     
Western Europe  Switzerland               1  ...  0.41978     0.29678
Western Europe      Iceland               2  ...  0.14145     0.43630
Western Europe      Denmark               3  ...  0.48357     0.34139
...

>>> df15_h.reset_index()
                 Region      Country  ...    Trust  Generosity
0        Western Europe  Switzerland  ...  0.41978     0.29678
1        Western Europe      Iceland  ...  0.14145     0.43630
2        Western Europe      Denmark  ...  0.48357     0.34139
...

>>> df15_h.reset_index().reset_index()
     index              Region      Country  ...     Trust  Generosity
0        0      Western Europe  Switzerland  ...   0.41978     0.29678
1        1      Western Europe      Iceland  ...   0.14145     0.43630
2        2      Western Europe      Denmark  ...   0.48357     0.34139
...

>>> df15.stack()
0    Happiness Rank                  1
     Country               Switzerland
     Region             Western Europe
     Happiness Score             7.587
     Economy                   1.39651
     ...                       ...
137  Family                    0.13995
     Health                    0.28443
     Freedom                   0.36453
     Trust                     0.10731
     Generosity                0.16681
Length: 1380, dtype: object

# 反向还原
>>> df15.stack().unstack()
Combining and Merging

# 提取每一年的排名
rank_list=[]
for i in range(len(df)):
    rank_list.append(df[i][0][['Country','Happiness Rank']].copy())

>>> wide=pd.merge(rank_list[0],rank_list[1],
...: on="Country", suffixes=[" 15"," 16"])
>>> wide=wide.rename(columns={"Happiness Rank 15":15, 
...:                          "Happiness Rank 16":16})
>>> wide

         Country   15   16
0    Switzerland    1    2
1        Iceland    2    3
2        Denmark    3    1
..           ...  ...  ...
135        Benin  155  153
136      Burundi  157  157
137         Togo  158  155

>>> df1
  name  number  score
0  Tom     101     77
1  Bob     102     85
2  Amy     103     64
3  Lee     104     92

>>> df2
   number  class
0     101    one
1     102    one
2     103    two
3     104  three

>>> pd.merge(df1,df2)
  name  number  score  class
0  Tom     101     77    one
1  Bob     102     85    one
2  Amy     103     64    two
3  Lee     104     92  three

>>> df3
     n  class
0  101    one
1  102    one
2  103    two
3  104  three

column names are different


>>> pd.merge(df1,df3,left_on='number',right_on='n')
  name  number  score    n  class
0  Tom     101     77  101    one
1  Bob     102     85  102    one
2  Amy     103     64  103    two
3  Lee     104     92  104  three

>>> df1

  name  number  score
0  Tom     101     77
1  Bob     102     85
2  Amy     103     64

>>> df4
   number  f_score
0     101       77
1     101       55
2     102       60
3     103       99

>>> pd.merge(df1,df4)
  name  number  score  f_score
0  Tom     101     77       77
1  Tom     101     77       55
2  Bob     102     85       60
3  Amy     103     64       99

>>> df5
  name  number  score
0  Tom     101     77
1  Tom     101     86
2  Bob     102     85
3  Amy     103     64

Many to Many


>>> pd.merge(df5,df4)
  name  number  score  f_score
0  Tom     101     77       77
1  Tom     101     77       55
2  Tom     101     86       77
3  Tom     101     86       55
4  Bob     102     85       60
5  Amy     103     64       99

>>> df6
  name  number  score
0  Tom     101     77
1  Bob     102     85
2  Amy     103     64
3  Ivy     105     82

>>> df7
   number  f_score
0     101       77
1     102       55
2     103       60
3     104       99

>>> pd.merge(df6,df7)

  name  number  score  f_score
0  Tom     101     77       77
1  Bob     102     85       55
2  Amy     103     64       60

>>> pd.merge(df6,df7,how='left')
  name  number  score  f_score
0  Tom     101     77     77.0
1  Bob     102     85     55.0
2  Amy     103     64     60.0
3  Ivy     105     82      NaN

>>> pd.merge(df6,df7,how="right")

  name  number  score  f_score
0  Tom     101   77.0       77
1  Bob     102   85.0       55
2  Amy     103   64.0       60
3  NaN     104    NaN       99

>>> pd.merge(df6,df7,how="outer")
  name  number  score  f_score
0  Tom     101   77.0     77.0
1  Bob     102   85.0     55.0
2  Amy     103   64.0     60.0
3  Ivy     105   82.0      NaN
4  NaN     104    NaN     99.0

Merging on Index


>>> df1
       name  score
number            
101     Tom     77
102     Bob     85
103     Amy     64
104     Lee     92

>>> df2

   number  class
0     101    one
1     102    one
2     103    two
3     104  three

>>> pd.merge(df1,df2,left_index=True, 
...:        right_on="number")
  name  score  number  class
0  Tom     77     101    one
1  Bob     85     102    one
2  Amy     64     103    two
3  Lee     92     104  three

>>> df3
        class
number       
101       one
102       one
103       two
104     three

A simple way


>>> df1.join(df3)

Concatenating


>>> long = pd.concat([rank_list[0],rank_list[1]])
>>> long
         Country  Happiness Rank
0    Switzerland               1
1        Iceland               2
2        Denmark               3
..           ...             ...
135       Rwanda             152
136     Zimbabwe             131
137  Afghanistan             154

>>> long = pd.concat([rank_list[0],rank_list[1]],
...:                ignore_index=True)
>>> long
         Country  Happiness Rank
0    Switzerland               1
1        Iceland               2
2        Denmark               3
..           ...             ...
273       Rwanda             152
274     Zimbabwe             131
275  Afghanistan             154

>>> long = pd.concat([rank_list[0],rank_list[1]],
...:                  keys=[15,16])
>>> long
            Country  Happiness Rank
15 0    Switzerland               1
   1        Iceland               2
   2        Denmark               3
                ...             ...
16 135       Rwanda             152
   136     Zimbabwe             131
   137  Afghanistan             154

>>> long = long.reset_index(0)
>>> long = long.rename(columns={"level_0": "Year"})
>>> long
     Year      Country  Happiness Rank
0      15  Switzerland               1
1      15      Iceland               2
2      15      Denmark               3
..    ...          ...             ...
135    16       Rwanda             152
136    16     Zimbabwe             131
137    16  Afghanistan             154
Reshaping and Pivoting

The wide format (用merge得到的数据,country不重,多列年份)

The long format (用concat得到的数据,country重复,单列年份)


>>> wide
         Country   15   16
0    Switzerland    1    2
1        Iceland    2    3
2        Denmark    3    1
..           ...  ...  ...
135        Benin  155  153
136      Burundi  157  157
137         Togo  158  155

>>> long
     Year      Country  Happiness Rank
0      15  Switzerland               1
1      15      Iceland               2
2      15      Denmark               3
..    ...          ...             ...
135    16  Afghanistan             154
136    16         Togo             155
137    16      Burundi             157

The wide format (用merge得到的数据,country不重,多列年份)

From wide to long


>>> m  = pd.melt(wide, id_vars=['Country'],  
...:     var_name="Year",value_name="rank")
>>> m
         Country Year  rank
0    Switzerland   15     1
1        Iceland   15     2
2        Denmark   15     3
..           ...  ...   ...
273        Benin   16   153
274      Burundi   16   157
275         Togo   16   155

>>> m.set_index(['Year', m.index])
              Country  rank
Year                       
15   0    Switzerland     1
     1        Iceland     2
     2        Denmark     3
                  ...   ...
16   273        Benin   153
     274      Burundi   157
     275         Togo   155

The long format (用concat得到的数据,country重复,单列年份)

From long to wide


>>> l = pd.pivot_table(long, index=["Country"], 
...:    columns="Year", values="Happiness Rank")
Year          15   16
Country              
Afghanistan  153  154
Albania       95  109
Algeria       68   38
             ...  ...
Yemen        136  147
Zambia        85  106
Zimbabwe     115  131

>>> l.reset_index()
Year      Country   15   16
0     Afghanistan  153  154
1         Albania   95  109
2         Algeria   68   38
..            ...  ...  ...
135         Yemen  136  147
136        Zambia   85  106
137      Zimbabwe  115  131

11.2 Pandas (6)

Data Aggregation and Group Operations
GroupBy Mechanics

table_all =[]
year_all=[]
for table, year in df:
    table_all.append(table)
    year_all.append(year)
rank_all = pd.concat(table_all, keys=year_all,join='inner')
rank_all = rank_all.reset_index(0)
rank_all = rank_all.rename(columns={'level_0':'Year'})

     year  Happiness Rank      Country  ...   Freedom     Trust  Generosity
0    2015               1  Switzerland  ...  0.665570  0.419780    0.296780
1    2015               2      Iceland  ...  0.628770  0.141450    0.436300
2    2015               3      Denmark  ...  0.649380  0.483570    0.341390
3    2015               4       Norway  ...  0.669730  0.365030    0.346990
4    2015               5       Canada  ...  0.632970  0.329570    0.458110
..    ...             ...          ...  ...       ...       ...         ...
133  2020             147     Botswana  ...  0.821328  0.777931   -0.250394
134  2020             148     Tanzania  ...  0.821540  0.619799    0.109669
135  2020             150       Rwanda  ...  0.900589  0.183541    0.055484
136  2020             151     Zimbabwe  ...  0.711458  0.810237   -0.072064
137  2020             153  Afghanistan  ...  0.396573  0.933687   -0.096429
Groupby Mechanics

>>> rank_all['Region'].unique()
array(['Western Europe', 'North America', 'Australia and New Zealand',
       'Middle East and Northern Africa', 'Latin America and Caribbean',
       'Southeastern Asia', 'Central and Eastern Europe', 'Eastern Asia',
       'Sub-Saharan Africa', 'Southern Asia'], dtype=object)

>>> grouped = rank_all['Happiness Rank'].groupby(rank_all['Country'])
>>> grouped
pandas.core.groupby.generic.SeriesGroupBy object at 0x000002007FEDDD08

>>> grouped.mean()
Country
Afghanistan    150.000000
Albania        106.166667
Algeria         71.833333
Argentina       35.166667
Armenia        121.666667
                   ...
Venezuela       76.333333
Vietnam         89.500000
Yemen          146.333333
Zambia         118.500000
Zimbabwe       137.500000
Name: Happiness Rank, Length: 138, dtype: float64

>>> g_1 = rank_all['Happiness Rank'].groupby([rank_all['Year'],
...:                                rank_all['Region']]).count()
>>> g_1[2015]
Region
Australia and New Zealand           2
Central and Eastern Europe         28
Eastern Asia                        4
Latin America and Caribbean        20
Middle East and Northern Africa    17
North America                       2
Southeastern Asia                   8
Southern Asia                       6
Sub-Saharan Africa                 31
Western Europe                     20
Name: Happiness Rank, dtype: int64

>>> g_1.unstack()
Region  Australia and New Zealand  ...  Western Europe
Year                               ...                
2015                            2  ...              20
2016                            2  ...              20
2017                            2  ...              20
2018                            2  ...              20
2019                            2  ...              20
2020                            2  ...              20

>>> g_2 = rank_all.groupby([rank_all['Year'],
...:                        rank_all['Region']]).mean()
>>> g_3 = g_2.unstack()
>>> g_3
                  Happiness Rank  ...     Generosity
Region Australia and New Zealand  ... Western Europe
Year                              ...               
2015                         9.5  ...       0.304130
2016                         8.5  ...       0.306789
2017                         9.0  ...       0.305329
2018                         9.0  ...       0.223850
2019                         9.5  ...       0.221050
2020                        10.0  ...       0.040858

Selecting Columns


>>> rank_all.groupby('Region')['Freedom'].mean()
# rank_all['Freedom'].groupby(rank_all['Region']).mean()

>>> rank_all.groupby(['Year','Region'])[['Freedom','Economy']].mean()

Function Application and Mapping


def f(arr):
    return arr.max()-arr.min() 

>>> g_4 = rank_all.groupby(['Year','Region'])['Happiness Rank'].apply(f)
Year  Region                         
2015  Australia and New Zealand            1
      Central and Eastern Europe         103
      Eastern Asia                        54
      Latin America and Caribbean        107
...   ...                                ...
2016  Australia and New Zealand            1
...   ...                                ...

>>> g_5 = g_4.unstack(0)
Year                             2015  2016  2017  2018  2019  2020
Region                                                             
Australia and New Zealand           1     1     2     2     3     4
Central and Eastern Europe        103   102   109   117   113   104
Eastern Asia                       54    48    49    40    39    33
Latin America and Caribbean       107   122   133   135   135   127
Middle East and Northern Africa   125   136   135   133   138   132
...

>>> r = lambda x: x.rank(method='min')
>>> g_6 = g_5.apply(r,axis='columns')
>>> g_6.applymap(int)
Year                             2015  2016  2017  2018  2019  2020
Region                                                             
Australia and New Zealand           1     1     3     3     5     6
Central and Eastern Europe          2     1     4     6     5     3
Eastern Asia                        6     4     5     3     2     1
Latin America and Caribbean         1     2     4     5     5     3
Middle East and Northern Africa     1     5     4     3     6     2
North America                       4     1     1     6     4     1
Southeastern Asia                   6     5     4     1     2     3
Southern Asia                       4     2     1     3     5     5
Sub-Saharan Africa                  1     3     2     5     4     6
Western Europe                      6     5     4     2     3     1
                    

Data Aggregation


>>> rank_all.groupby('Region')['Freedom'].aggregate(['count','mean'])
                                 count      mean
Region                                          
Australia and New Zealand           12  0.663815
Central and Eastern Europe         168  0.423580
Eastern Asia                        24  0.465703
Latin America and Caribbean        120  0.518843
Middle East and Northern Africa    102  0.406711
North America                       12  0.613810
Southeastern Asia                   48  0.602917
Southern Asia                       36  0.439074
Sub-Saharan Africa                 186  0.427333
Western Europe                     120  0.574996

# rank_all.groupby('Region').agg(['count','mean'])
# count, sum, mean, median, std, var 
# min, max, prod, first, last

>>> rank_all.groupby('Region').agg({'Year':'count', 'Freedom': "mean"})
                                 Year   Freedom
Region                                         
Australia and New Zealand          12  0.663815
Central and Eastern Europe        168  0.423580
Eastern Asia                       24  0.465703
Latin America and Caribbean       120  0.518843
Middle East and Northern Africa   102  0.406711
North America                      12  0.613810
Southeastern Asia                  48  0.602917
Southern Asia                      36  0.439074
Sub-Saharan Africa                186  0.427333
Western Europe                    120  0.574996

def f(arr):
    return arr.max()-arr.min()

>>> rank_all.groupby(['Year','Region'])['Happiness Rank'].agg(f)
Region
Australia and New Zealand            4
Central and Eastern Europe         119
Eastern Asia                        55
Latin America and Caribbean        136
Middle East and Northern Africa    141
North America                       14
Southeastern Asia                  123
Southern Asia                       88
Sub-Saharan Africa                 109
Western Europe                     101
Name: Happiness Rank, dtype: int64

Quantile and Bucket Analysis


>>> g_7 = rank_all.groupby(['Year','Country'])['Happiness Rank'].mean()
>>> g_7
Year  Country    
2015  Afghanistan    153
      Albania         95
      Algeria         68
...      ...         ...
2020  Yemen          146
      Zambia         141
      Zimbabwe       151

>>> interval_50 = pd.cut(g_7.loc[2015].values, bins = [0,50,100,150,200])
>>> interval_50
[(150, 200], (50, 100], (50, 100], ..., (50, 100], (100, 150]]
Length: 138
Categories (4, interval[int64]): 
[(0, 50] < (50, 100] < (100, 150] < (150, 200]]

>>> g_7.loc[2015].groupby(interval_50).count()
(0, 50]       45
(50, 100]     42
(100, 150]    44
(150, 200]     7

>>> interval_50 = pd.cut(g_7.loc[2015].values, bins = [0,50,100,150,200],
...: labels=['1-50','51-100','101-150','151-200'])

>>> g_7.loc[2015].groupby(interval_50).count()
1-50       45
51-100     42
101-150    44
151-200     7

>>> quantile = pd.qcut(g_7.loc[2015].values, 3)
>>> quantile
[(106.333, 158.0], (51.667, 106.333], ..., (51.667, 106.333], (106.333, 158.0]]
Length: 138
Categories (3, interval[float64]): [(0.999, 51.667] < 
                (51.667, 106.333] < (106.333, 158.0]]

>>> g_7.loc[2015].groupby(quantile).count()
(0.999, 51.667]      46
(51.667, 106.333]    46
(106.333, 158.0]     46

Pivot Tables and Cross-Tabulation (数据透视表)


>>> rank_all.pivot_table(values="Economy", columns="Year", index="Country")
Year            2015     2016      2017   2018   2019      2020
Country                                                        
Afghanistan  0.31982  0.38227  0.401477  0.332  0.350  7.462861
Albania      0.87867  0.95530  0.996193  0.916  0.947  9.417931
Algeria      0.93929  1.05266  1.091864  0.979  1.002  9.537965
                ...      ...       ...    ...    ...       ...
Yemen        0.54649  0.57939  0.591683  0.442  0.287  7.759683
Zambia       0.47038  0.61202  0.636407  0.562  0.578  8.224720
Zimbabwe     0.27100  0.35041  0.375847  0.357  0.366  7.865712

>>> rank_all.pivot_table(values="Economy", columns="Year",
...:                      index="Region", aggfunc="mean")
Year                                 2015      2016  ...      2019       2020
Region                                               ...                     
Australia and New Zealand        1.291880  1.402545  ...  1.337500  10.610770
Central and Eastern Europe       0.943293  1.048545  ...  1.022429   9.662970
Eastern Asia                     1.058415  1.188970  ...  1.151250  10.047753
Latin America and Caribbean      0.854139  0.959183  ...  0.908950   9.255175
...

>>> rank_all.pivot_table(values="Economy", columns="Year",
...:    index="Country", margins= True, margins_name="sum")
Year             2015      2016      2017  ...      2019      2020       sum
Country                                    ...                              
Afghanistan  0.319820  0.382270  0.401477  ...  0.350000  7.462861  1.541405
Albania      0.878670  0.955300  0.996193  ...  0.947000  9.417931  2.351849
              ...       ...       ...  ...       ...       ...       ...
Zambia       0.470380  0.612020  0.636407  ...  0.578000  8.224720  1.847254
Zimbabwe     0.271000  0.350410  0.375847  ...  0.366000  7.865712  1.597661
sum          0.856583  0.959581  0.998364  ...  0.925674  9.343750  2.331606
                        
Batch Data Visualization

import os
import pandas as pd
import matplotlib.pyplot as plt

# 提取文件名和文件内容
os.chdir('happiness')
current_path = os.getcwd()
files=os.listdir()
df=[]
for filename in files:
    dot_position = filename.find('.')
    file_key = filename[:dot_position]
    t = (pd.read_csv(os.path.join(current_path,filename)), int(file_key))
    df.append(t)
os.chdir('..')

# 每个年份的表格拼接
table_all =[]
year_all=[]
for table, year in df:
    table_all.append(table)
    year_all.append(year)
rank_all = pd.concat(table_all, keys=year_all,join='inner')

# 计算每个年份,每个区域的排名平均值
r_all=[]
for i in year_all:
    r_all.append(rank_all.loc[i].groupby('Region')\
    ['Happiness Rank'].mean().sort_values(ascending=False))

# 创建figures文件夹用来放图片
exist=os.path.exists('figures')  
if not exist:
    os.mkdir('figures')
os.chdir('figures')

#批量生成图片,注意结尾处要close
plt.style.use('ggplot')
for j in range(len(r_all)):
    plt.barh(r_all[j].index, r_all[j].values)
    plt.title(str(year_all[j]), loc='center')
    plt.xticks(rotation=45, fontsize=8)
    plt.yticks(fontsize=8)
    plt.ylabel("Region", fontsize=10)
    plt.xlabel("Avg. Rank", fontsize=10)
    plt.savefig(str(year_all[j])+'.jpg',dpi=300,bbox_inches='tight')
    plt.close()

Summary

  • Pandas
    • Reading: Python for Data Analysis, Chapter 8, 10