研究問題與動機描述

聽到閃電波特退休的消息,聯想到年紀越來越大容易造成運動選手身體各個層面的衰老,進而導致比賽的失利而無法取得獎牌。使我們想要探討人的生理年齡是否會大幅度的影響得到獎牌的機率。

讀取套件及資料

library(readr)
athlete<-read_csv("../asset/athlete_all.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   country = col_character(),
##   Name = col_character(),
##   Sex = col_character(),
##   NOC = col_character(),
##   Games = col_character(),
##   Season = col_character(),
##   City = col_character(),
##   Sport = col_character(),
##   Event = col_character(),
##   Medal = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 42 parsing failures.
##  row           col               expected actual                       file
## 1166 female_school no trailing characters      r '../asset/athlete_all.csv'
## 1167 female_school no trailing characters      r '../asset/athlete_all.csv'
## 1168 female_school no trailing characters      r '../asset/athlete_all.csv'
## 1169 female_school no trailing characters      r '../asset/athlete_all.csv'
## 1170 female_school no trailing characters      r '../asset/athlete_all.csv'
## .... ............. ...................... ...... ..........................
## See problems(...) for more details.
pacman::p_load(devtools,dplyr, ggplot2, readr, plotly, googleVis,ggthemes,d3heatmap,magrittr)

資料探索(敘述性統計)

summary(athlete) 
##        X1           country               year            ID        
##  Min.   :     1   Length:271116      Min.   :1896   Min.   :     1  
##  1st Qu.: 67780   Class :character   1st Qu.:1960   1st Qu.: 34643  
##  Median :135559   Mode  :character   Median :1988   Median : 68205  
##  Mean   :135559                      Mean   :1978   Mean   : 68249  
##  3rd Qu.:203337                      3rd Qu.:2002   3rd Qu.:102097  
##  Max.   :271116                      Max.   :2016   Max.   :135571  
##                                                                     
##      Name               Sex                 Age            Height     
##  Length:271116      Length:271116      Min.   :10.00   Min.   :127.0  
##  Class :character   Class :character   1st Qu.:21.00   1st Qu.:168.0  
##  Mode  :character   Mode  :character   Median :24.00   Median :175.0  
##                                        Mean   :25.56   Mean   :175.3  
##                                        3rd Qu.:28.00   3rd Qu.:183.0  
##                                        Max.   :97.00   Max.   :226.0  
##                                        NA's   :9474    NA's   :60171  
##      Weight          NOC               Games              Season         
##  Min.   : 25.0   Length:271116      Length:271116      Length:271116     
##  1st Qu.: 60.0   Class :character   Class :character   Class :character  
##  Median : 70.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 70.7                                                           
##  3rd Qu.: 79.0                                                           
##  Max.   :214.0                                                           
##  NA's   :62875                                                           
##      City              Sport              Event              Medal          
##  Length:271116      Length:271116      Length:271116      Length:271116     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     ce_rate        electricity        female_school     food_suppiy    
##  Min.   : 17.00   Min.   :3.500e+09   Min.   : 15.40   Min.   :1440    
##  1st Qu.: 47.60   1st Qu.:7.350e+10   1st Qu.: 96.50   1st Qu.:2890    
##  Median : 51.70   Median :2.260e+11   Median :100.00   Median :3130    
##  Mean   : 55.43   Mean   :6.796e+11   Mean   : 96.84   Mean   :3090    
##  3rd Qu.: 58.20   3rd Qu.:6.020e+11   3rd Qu.:103.00   3rd Qu.:3390    
##  Max.   :118.00   Max.   :6.140e+12   Max.   :130.00   Max.   :3810    
##  NA's   :83417    NA's   :166832      NA's   :131069   NA's   :122790  
##  health_expense     income_GDP        baby_pw          co2_em     
##  Min.   : 1.69    Min.   :   263   Min.   :1.12    Min.   : 0.01  
##  1st Qu.: 5.71    1st Qu.:  7390   1st Qu.:1.61    1st Qu.: 3.04  
##  Median : 7.76    Median : 14800   Median :2.04    Median : 6.48  
##  Mean   : 7.88    Mean   : 19366   Mean   :2.52    Mean   : 7.34  
##  3rd Qu.: 9.49    3rd Qu.: 30100   3rd Qu.:2.87    3rd Qu.:10.00  
##  Max.   :20.60    Max.   :136000   Max.   :8.30    Max.   :61.80  
##  NA's   :206269   NA's   :45550    NA's   :46166   NA's   :59753  
##  child_mortality    inflation       unemployment_15    bc_w_death    
##  Min.   :  2.10   Min.   : -17.10   Min.   : 0.14    Min.   : 0.53   
##  1st Qu.:  7.60   1st Qu.:   1.67   1st Qu.: 4.65    1st Qu.:15.50   
##  Median : 19.60   Median :   3.92   Median : 6.83    Median :20.20   
##  Mean   : 52.64   Mean   :  17.17   Mean   : 7.56    Mean   :19.48   
##  3rd Qu.: 60.60   3rd Qu.:   8.46   3rd Qu.: 9.62    3rd Qu.:23.80   
##  Max.   :536.00   Max.   :6040.00   Max.   :35.50    Max.   :46.60   
##  NA's   :45763    NA's   :115730    NA's   :154684   NA's   :103013  
##  democracy_score 
##  Min.   :-10.00  
##  1st Qu.:  3.00  
##  Median : 10.00  
##  Mean   :  5.43  
##  3rd Qu.: 10.00  
##  Max.   : 10.00  
##  NA's   :79678

資料處理(dplyr)

a<-athlete[,c(-1)]

a$Medal[is.na(a[,15])==T]=0
a$Medal<-factor(a$Medal,levels = c(0,"Bronze","Silver","Gold"))
colnames(a)[1:2]<-c("Team","Year")

資料視覺化

a1<-a%>%         
  na.omit() %>%  #刪掉空值
  group_by(Age,Medal) %>%  #用年齡、獎牌分組
  summarize(medal=sum(as.double(Medal))) %>%  #總結獎牌數
  ggplot(aes(Age,medal,fill=Medal ))+geom_col()+  #畫圖
  theme_economist() + scale_color_economist()
ggplotly(a1)

資料視覺化

demo <- a %>% 
  select(Team, Year, Sex, democracy_score) %>% 
  na.omit() %>%
  group_by(Year) %>% 
  mutate(nPeople = n()) %>%
  filter(Sex == "F") %>% 
  mutate(nFemale = n()) %>% 
  mutate(pFemale = nFemale/nPeople) %>% 
  distinct(Team, Year, democracy_score, pFemale) %>% 
  ungroup() %>% 
  group_by(Year) %>% 
  mutate(mean_demo = mean(democracy_score)) %>% 
  distinct(Year, pFemale, mean_demo) %>% 
  arrange(Year)

ggplot(aes(Year, pFemale, mean_demo), data = demo)+
  geom_line()+theme_economist() + scale_color_economist()

ggplot(aes(Year, mean_demo), data = demo)+
  geom_line()+theme_economist() + scale_color_economist()

結論與洞察

  • 23歲是運動員得到獎牌的高峰。
  • 28歲以後得獎機率開始有明顯的下滑。