單元摘要 :學習使用R語言來進行一系列數據圖像化
身為商管人才,我們要告訴客戶、上級主管,甚至投資人進行數據的闡述,這些密密麻麻的數據資料並不是能夠讓人為之一亮的,也甚至因為過於龐大複雜,而使有內涵的數據被遺忘,因此我們就必須將這些資料「視覺化」。 這邊我們要教你們在資料經過清洗、篩選後,該如何將資料「視覺化」,R語言與其他程式語言像Excel、Python一樣,基本數據操作、清理外,R語言視覺化工具可說是強者中的強者。
我們在這邊先教大家最家喻戶曉的圖像化工具,分別是ggplot2、d3heatmap套件。
#rm(list=ls())
pacman::p_load(devtools,dplyr, ggplot2, readr, plotly, googleVis,ggthemes,d3heatmap,magrittr)
## Installing package into '/home/b041010004/R/x86_64-redhat-linux-gnu-library/3.4'
## (as 'lib' is unspecified)
##
## ggthemes installed
## Installing package into '/home/b041010004/R/x86_64-redhat-linux-gnu-library/3.4'
## (as 'lib' is unspecified)
##
## d3heatmap installed
#setwd("~/camp")
#1.
athlete_all<- read_csv('asset/athlete_all.csv')
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_integer(),
## country = col_character(),
## year = col_integer(),
## ID = col_integer(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_integer(),
## Height = col_integer(),
## NOC = col_character(),
## Games = col_character(),
## Season = col_character(),
## City = col_character(),
## Sport = col_character(),
## Event = col_character(),
## Medal = col_character(),
## food_suppiy = col_integer(),
## income_GDP = col_integer(),
## democracy_score = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 42 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual file expected <int> <chr> <chr> <chr> <chr> actual 1 1166 female_school no trailing characters r 'asset/athlete_all.csv' file 2 1167 female_school no trailing characters r 'asset/athlete_all.csv' row 3 1168 female_school no trailing characters r 'asset/athlete_all.csv' col 4 1169 female_school no trailing characters r 'asset/athlete_all.csv' expected 5 1170 female_school no trailing characters r 'asset/athlete_all.csv'
## ... ................. ... ........................................................................... ........ ........................................................................... ...... ........................................................................... .... ........................................................................... ... ........................................................................... ... ........................................................................... ........ ...........................................................................
## See problems(...) for more details.
#2
a<-athlete_all[,c(-1,-17:-29)]
#3
a$Medal[is.na(a[,15])==T]=0
a$Medal<-factor(a$Medal,levels = c(0,"Bronze","Silver","Gold"))
a<- na.omit(a)
colnames(a)[1:2]<-c("Team","Year")
#4
#我設定的亞洲五國以及美洲五國,後續想探討這兩組國家來進行組內比較
asian<-c("China","Korea","Japan","India","Chinese Taipei","North Korea")
america<-c("United States","Mexico","Brazil","Canada","Argentina")
#篩選出歷年拿牌最多的6個國家
top<-a %>% group_by(Team) %>% summarize(medal=sum(as.double(Medal))) %>% arrange(medal)%>% tail()
library(googleVis)
#op = options(gvis.plot.tag='chart')
a1<-athlete_all %>% na.omit() %>% group_by(country,year,ce_rate,electricity,female_school,food_suppiy,health_expense,income_GDP,baby_pw,co2_em,child_mortality,inflation,unemployment_15,bc_w_death,democracy_score ) %>% summarise(Age=mean(Age),Height=mean(Height),Weight=mean(Weight))
gvisMotionChart(a1, "country", "year") %>% plot
## starting httpd help server ... done
1.ggplot內容可以像洋蔥一樣一層一層的來劃分:
- 資料來源(data):指定原始資料來源的 data frame。
- 美學對應(aesthetic):指定原始資料與圖形之間的對應關係,例如哪一個變數要當作x座標變數,而哪一個要當作y座標變數 ,還有資料繪圖時的樣式等。
- 幾何圖案(geometry):要用什麼幾何圖形繪製資料,例如點、線條、多邊形等。
- 繪圖面(facet):指定如何將資料分散在多張子圖形中繪製,以利互相比較。
- 統計轉換(statistical transformation):指定如何以將資料轉換為各種統計量,例如將連續型資料轉為離散型的類別。
- 座標系統(coordinate system):指定繪圖時所使用的座標系統,除了常見的笛卡兒直角座標系統,也可以使用極坐標或地圖投影(map projection)。
- 主題(theme):控制資料以外的繪圖組件,例如座標軸、說明文字等。
資料來源:https://blog.gtwang.org/r/ggplot2-tutorial-basic-concept-and-qplot/
2.函式表現
ggplot( data ,aes(x = __ , y = __ ,...))
+geom_[chart]()
aes()
負責將素材綁定至 X 軸與 Y 軸,這是 aesthetic mappings 的縮寫;接著我們還要告訴 ggplot 物件要用什麼樣的外觀呈現,這時接著利用 geom_()
指定繪圖的類型,這是 geometric objects 的縮寫,底線後面加上繪圖名稱。
#奧運參賽選走的體重與年齡點狀分布
plot(a$Age,a$Weight)
#奧運女子一百公尺田徑項目得獎之選手身高、體重點狀分布
a1<-a%>%filter(a$Medal!=0, a$Event=="Athletics Women's 100 metres")%>%
ggplot(aes(Weight,Height))+geom_point(col="#007799")+
theme_economist() + scale_color_economist()
ggplotly(a1)
#奧運男子一百公尺田徑項目得獎之選手身高、體重點狀分布
a1<-a%>%filter(a$Medal!=0,a$Event=="Athletics Men's 100 metres")%>%
ggplot(aes(Weight,Height))+geom_point(col="#007799")+
theme_economist() + scale_color_economist()
ggplotly(a1)
2.而弟弟韓火想知道,在2000年以後,全世界各國的「Income_GDP」(x軸)與「health_expense」(y軸)之間的關係,請以點狀圖表示之。
#簡易版(全體參賽者的體重)
boxplot(a$Weight)
#ggplot版
#亞洲五國的身高分布
a1<-a%>%filter(Team%in%asian) %>%
ggplot(aes(Team,Height))+geom_boxplot(fill="#007799",col="black")+
theme_economist() + scale_color_economist()
ggplotly(a1)
#美洲五國的身高分布
a1<-a%>%filter(Team%in%america) %>%
ggplot(aes(Team,Height))+geom_boxplot(fill="#007799",col="black")+
theme_economist() + scale_color_economist()
ggplotly(a1)
2.他還想知道在夏日以及冬至的奧運選手體重的盒鬚圖,你願意像哆啦A夢一樣一而再,再而三地幫他嘛?霸脫!
#亞洲五國的參賽奧運人次比較
a1<-a%>%filter(a$Team%in%asian)%>%ggplot(aes(Team ))+geom_bar(fill="#007799")+
theme_economist() + scale_color_economist()
ggplotly(a1)
#美洲五國參賽奧運人次比較
a1<-a%>%filter(a$Team%in%america)%>%ggplot(aes(Team ))+geom_bar(fill="#007799")+
theme_economist() + scale_color_economist()
ggplotly(a1)
2.另外,柔道校隊的他也想知道亞洲五大國之奪牌選手中,每個項目數量個別是多少?
#亞洲五國各國得牌數量比較
a1<-a%>%filter(a$Team%in%asian)%>%
group_by(Team,Medal) %>% summarize(medal=sum(as.double(Medal))) %>%
ggplot(aes(Team,medal,fill=Medal ))+geom_col()+
theme_economist() + scale_color_economist()
ggplotly(a1)
#美洲五國各國得牌數量比較
a1<-a%>%filter(a$Team%in%america)%>%
group_by(Team,Medal) %>% summarize(medal=sum(as.double(Medal))) %>%
ggplot(aes(Team,medal,fill=Medal ))+geom_col()+
theme_economist() + scale_color_economist()
ggplotly(a1)
直條圖Igeom_bar
與直條圖IIgeom_col
差異:
ANSWER: bar的y軸只能放count,col的y軸則沒有限定。
#全體參賽者身高直方圖
a1<-ggplot(a,aes(Height))+geom_histogram(fill="#007799")+
theme_economist() + scale_color_economist()
ggplotly(a1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#全體參賽者體重直方圖
a1<-ggplot(a,aes(Weight))+geom_histogram(fill="#007799")+
theme_economist() + scale_color_economist()
ggplotly(a1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#全體參賽者年齡直方圖
a1<-ggplot(a,aes(Age))+geom_histogram(fill="#007799")+
theme_economist() + scale_color_economist()
ggplotly(a1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#全體參賽者身高直方與機率密度圖
a1<-ggplot(a,aes(Height))+
geom_histogram(bins = 30, aes(y = ..density..), alpha = 0.5,fill="#007799")+
theme_economist() + scale_color_economist()+geom_density()
ggplotly(a1)
#全體參賽者體重直方與機率密度圖
a1<-ggplot(a,aes(Weight))+
geom_histogram(bins = 30, aes(y = ..density..), alpha = 0.5,fill="#007799")+
theme_economist() + scale_color_economist()+geom_density()
ggplotly(a1)
#亞洲五國於"夏季"奧運總得牌數年度變化
a1<-a%>%filter(a$Team%in%asian,a$Medal!=0,a$Season=="Summer")%>%
group_by(Year,Team) %>% summarize(medal=sum(as.double(Medal))) %>%
ggplot(aes(Year,medal ,col=Team))+geom_line()+
theme_economist() + scale_color_economist()+geom_point()
ggplotly(a1)
#美洲五國於"夏季"奧運總得牌數年度變化
a1<-a%>%filter(a$Team%in%america,a$Medal!=0,a$Season=="Summer")%>%
group_by(Year,Team) %>% summarize(medal=sum(as.double(Medal))) %>%
ggplot(aes(Year,medal ,col=Team))+geom_line()+theme_economist() +
scale_color_economist()+geom_point()
ggplotly(a1)
#最多獎牌的前六名國家每年夏季奧運得獎數比較
a1<-a%>%filter(a$Team%in%top$Team,a$Medal!=0,a$Season=="Summer")%>%
group_by(Year,Team) %>% summarize(medal=sum(as.double(Medal))) %>%
ggplot(aes(Year,medal ,col=Team))+geom_line()+
theme_economist() + scale_color_economist()+geom_point()
ggplotly(a1)
<基本套件plot與ggplot2中的qplot之間差異> R語言除了此套件外,也有基本的內建視覺化工具「plot」函數,雖然他也能夠化特定的圖形,但是美化方面以及進階功能方面,遠遠不及GGPLOT2套件。
覺得ggplot2很多東西要打,很麻煩對吧,ggplot2套件中還有一個形式可以提供你輸出圖表,就是qplot函數。
plot(x,y,type=,...)
qplot(x, y, ..., data, facets = NULL, margins = FALSE, geom = "auto",...)
qplot 有提供一些參數可以讓使用者更改資料點的顏色、大小等樣式,而且在使用上會比傳統的 plot 函數更方便,在使用 plot 更改資料點的樣式時,使用者必須自行將類別型的資料轉換為 plot 可以接受的數值或名稱(例如 red、blue 等),而 qplot 則是可以自動處理這些繁瑣的動作,並且在圖形上加入圖示說明(legend)
plot(a$Age,a$Weight)
qplot(a$Age,a$Weight,color=a$Sex)
ggplot還有很多奇奇怪怪的配置可以供使用,但我們在這邊只先講較常碰到的使用。 ####1.size,col
#Alpine skiing 賽事中奪牌者之身高、體重、性別、與年紀點狀圖
a1<-a %>%filter(a$Medal!=0,a$Sport=="Alpine Skiing") %>%
ggplot(aes(Height,Weight,colour=Sex,size=Age)) + geom_point()
ggplotly(a1)
#亞洲五國1996年以後各國年齡分布比較
a %>% filter(Team%in%asian,Year>=1996) %>%
ggplot(aes(Age))+geom_bar(fill="#007799")+
theme_economist() + scale_color_economist() +
facet_wrap(~Team)
a %>% filter(Team%in%asian,Year>=1996) %>%
ggplot(aes(Age))+geom_bar(fill="#007799")+
theme_economist() + scale_color_economist() +
coord_flip() + facet_wrap(~Team)
#美國2000年以後各奧運賽事"田徑"賽事選手年齡分布比較
a %>% filter(Team%in%"United States",Year>=2000,a$Sport=="Athletics") %>%
ggplot(aes(Age))+geom_histogram(fill="#007799")+
theme_economist() + scale_color_economist()+facet_wrap(~Year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
腳踏車手的身高體重熱圖 !!!格式限制 matrix.data.frame
#在1956年以前,各項目奪獎選手之身高體重分布熱圖
#擷取年分與得獎者
heat<-a %>% filter(Year<=1956,Medal!=0);
#將連續變數變成類別變數
#身高
heat$Height<-cut(heat$Height,breaks=c(140,150,161,171,181,191,201,211,221),labels=c("140-150","151~160","161~170","171~180","181~190","191~200","201~210","211~220"))
#體重
heat$Weight<-cut(heat$Weight,breaks=c(40,55,70,85,100,115,130,145),labels=c("40-55","56-70","71-85","86-100","101-115","116-130","131-145"))
#建立矩陣
heat1<-table(heat$Height,heat$Weight)%>%
t() %>% {./rowSums(.)}%>%t() %>%
as.data.frame.matrix(dimnames(list(c(as.character(unique(a$Height))),c(as.character( unique(a$Weight))))),rownames.force=T,colnames=c(as.character( unique(a$Weight))))
#畫圖
d3heatmap(heat1,F,F,colors = "Blues",yaxis_width=120,yaxis_font_size = 12)
#1956以後,各項目奪獎選手之身高體重分布熱圖
#擷取年分與得獎者
heat2<-a %>% filter(Year>1956,Medal!=0);
#將連續變數變成類別變數
#身高
heat2$Height<-cut(heat2$Height,breaks=c(140,150,161,171,181,191,201,211,221),labels=c("140-150","151~160","161~170","171~180","181~190","191~200","201~210","211~220"))
#體重
heat2$Weight<-cut(heat2$Weight,breaks=c(40,55,70,85,100,115,130,145),labels=c("40-55","56-70","71-85","86-100","101-115","116-130","131-145"))
#建立矩陣
heat3<-table(heat2$Height,heat2$Weight)%>%
t() %>% {./rowSums(.)}%>%t() %>%
as.data.frame.matrix(dimnames(list(c(as.character(unique(a$Height))),c(as.character( unique(a$Weight))))),rownames.force=T,colnames=c(as.character( unique(a$Weight))))
#畫圖
d3heatmap(heat3,F,F,colors = "Blues",yaxis_width=120,yaxis_font_size = 12)
#得過獎的國家以及運動項目熱圖(檢測某運動項目的奪牌常勝軍)
TS<-a %>% filter(a$Medal!=0)
#數量落差太大 因此去除第一名第二名
T30T<-table(TS$Team) %>%sort %>% tail(32) %>% as.data.frame();T30T<-T30T[c(-32,-31),]
#比賽項目前30多的運動
T30S<-table(TS$Sport) %>% sort %>% tail(30) %>% as.data.frame()
#做圖
TS1<-TS %>% filter(TS$Team%in% T30T$Var1,TS$Sport%in%T30S$Var1)
table(TS1$Team,TS1$Sport)%>%t() %>% {./rowSums(.)}%>%t() %>% as.data.frame.matrix %>% d3heatmap(colors="Blues",yaxis_width=120,yaxis_font_size = 12)
#沒有去除第一二名的情況
T30T<-table(TS$Team) %>%sort %>% tail(30) %>% as.data.frame()
T30S<-table(TS$Sport) %>% sort %>% tail(30) %>% as.data.frame()
TS1<-TS %>% filter(TS$Team%in% T30T$Var1,TS$Sport%in%T30S$Var1 )
table(TS1$Team,TS1$Sport)%>%t() %>% {./rowSums(.)}%>%t() %>% as.data.frame.matrix %>% d3heatmap(colors="Blues",yaxis_width=120,yaxis_font_size = 12)
Base tools: plot(x,y,...)
“p” for points, “l” for lines, “h” for ‘histogram’ like (or ‘high-density’) vertical lines,
GGPlot2 Packages: ggplot(data,aes(x = __ , y = __ ,...))
+geom_[chart]()
qplot(x,y,color,facets=,...)
d3heatmap(matrix)