基本的程式筆記設定,安裝、載入一些基本的套件
# Notebook setup: start from a clean workspace, set knitr/print options,
# and make sure the packages used below are installed and attached.

# Remove every object (including dot-prefixed ones) from the workspace.
# The argument is spelled out in full and uses TRUE, because `all` only works
# through partial matching and `T` is an ordinary, reassignable variable.
rm(list = ls(all.names = TRUE))

# Print chunk output without a comment prefix.
knitr::opts_chunk$set(comment = NA)
# Let par() settings made in one chunk carry over to later chunks.
knitr::opts_knit$set(global.par = TRUE)
# Avoid scientific notation, print 4 significant digits, use 90-column output.
options(scipen = 20, digits = 4, width = 90)

# Install pacman only when it is missing. requireNamespace() tests availability
# without attaching the package; the call below is namespace-qualified anyway.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
# Install (if needed) and attach magrittr, which provides the %>% pipe.
pacman::p_load(magrittr)
以上這些程式碼請大家不要去改動。
read.csv()
WHO = read.csv("data/WHO.csv")
str()
str(WHO)
'data.frame': 194 obs. of 13 variables:
$ Country : chr "Afghanistan" "Albania" "Algeria" "Andorra" ...
$ Region : chr "Eastern Mediterranean" "Europe" "Africa" "Europe" ...
$ Population : int 29825 3162 38482 78 20821 89 41087 2969 23050 8464 ...
$ Under15 : num 47.4 21.3 27.4 15.2 47.6 ...
$ Over60 : num 3.82 14.93 7.17 22.86 3.84 ...
$ FertilityRate : num 5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ...
$ LifeExpectancy : int 60 74 73 82 51 75 76 71 82 81 ...
$ ChildMortality : num 98.5 16.7 20 3.2 163.5 ...
$ CellularSubscribers : num 54.3 96.4 99 75.5 48.4 ...
$ LiteracyRate : num NA NA NA NA 70.1 99 97.8 99.6 NA NA ...
$ GNI : num 1140 8820 8310 NA 5230 ...
$ PrimarySchoolEnrollmentMale : num NA NA 98.2 78.4 93.1 91.1 NA NA 96.9 NA ...
$ PrimarySchoolEnrollmentFemale: num NA NA 96.4 79.4 78.2 84.5 NA NA 97.5 NA ...
head(), tail()
head(WHO)
Country Region Population Under15 Over60 FertilityRate
1 Afghanistan Eastern Mediterranean 29825 47.42 3.82 5.40
2 Albania Europe 3162 21.33 14.93 1.75
3 Algeria Africa 38482 27.42 7.17 2.83
4 Andorra Europe 78 15.20 22.86 NA
5 Angola Africa 20821 47.58 3.84 6.10
6 Antigua and Barbuda Americas 89 25.96 12.35 2.12
LifeExpectancy ChildMortality CellularSubscribers LiteracyRate GNI
1 60 98.5 54.26 NA 1140
2 74 16.7 96.39 NA 8820
3 73 20.0 98.99 NA 8310
4 82 3.2 75.49 NA NA
5 51 163.5 48.38 70.1 5230
6 75 9.9 196.41 99.0 17900
PrimarySchoolEnrollmentMale PrimarySchoolEnrollmentFemale
1 NA NA
2 NA NA
3 98.2 96.4
4 78.4 79.4
5 93.1 78.2
6 91.1 84.5
summary()
summary(WHO)
Country Region Population Under15 Over60
Length:194 Length:194 Min. : 1 Min. :13.1 Min. : 0.81
Class :character Class :character 1st Qu.: 1696 1st Qu.:18.7 1st Qu.: 5.20
Mode :character Mode :character Median : 7790 Median :28.6 Median : 8.53
Mean : 36360 Mean :28.7 Mean :11.16
3rd Qu.: 24535 3rd Qu.:37.8 3rd Qu.:16.69
Max. :1390000 Max. :50.0 Max. :31.92
FertilityRate LifeExpectancy ChildMortality CellularSubscribers LiteracyRate
Min. :1.26 Min. :47.0 Min. : 2.20 Min. : 2.57 Min. :31.1
1st Qu.:1.83 1st Qu.:64.0 1st Qu.: 8.43 1st Qu.: 63.57 1st Qu.:71.6
Median :2.40 Median :72.5 Median : 18.60 Median : 97.75 Median :91.8
Mean :2.94 Mean :70.0 Mean : 36.15 Mean : 93.64 Mean :83.7
3rd Qu.:3.90 3rd Qu.:76.0 3rd Qu.: 55.98 3rd Qu.:120.81 3rd Qu.:97.8
Max. :7.58 Max. :83.0 Max. :181.60 Max. :196.41 Max. :99.8
NA's :11 NA's :10 NA's :91
GNI PrimarySchoolEnrollmentMale PrimarySchoolEnrollmentFemale
Min. : 340 Min. : 37.2 Min. : 32.5
1st Qu.: 2335 1st Qu.: 87.7 1st Qu.: 87.3
Median : 7870 Median : 94.7 Median : 95.1
Mean :13321 Mean : 90.8 Mean : 89.6
3rd Qu.:17558 3rd Qu.: 98.1 3rd Qu.: 97.9
Max. :86440 Max. :100.0 Max. :100.0
NA's :32 NA's :93 NA's :93
15歲以下人口比率 mean(), sd()
mean(WHO$Under15)
[1] 28.73
sd(WHO$Under15)
[1] 10.53
summary(WHO$Under15)
Min. 1st Qu. Median Mean 3rd Qu. Max.
13.1 18.7 28.6 28.7 37.8 50.0
15歲以下人口比率最低的國家 which(), which.min(), which.max()
which.min(WHO$Under15)
[1] 86
WHO$Country[86]
[1] "Japan"
WHO$Country[which.min(WHO$Under15)]
[1] "Japan"
15歲以下人口比率最高的國家
which.max(WHO$Under15)
[1] 124
WHO$Country[124]
[1] "Niger"
WHO$Country[which.max(WHO$Under15)]
[1] "Niger"
subset(), nrow()
# Countries with GNI above 10000 AND a fertility rate above 2.5.
# which() keeps only rows where the condition is TRUE, so rows with a missing
# GNI or FertilityRate are left out.
Outliers <- WHO[which(WHO$GNI > 10000 & WHO$FertilityRate > 2.5), ]
# Number of countries that satisfy both conditions.
nrow(Outliers)
[1] 7
Outliers[c("Country","GNI","FertilityRate")]
Country GNI FertilityRate
23 Botswana 14550 2.71
56 Equatorial Guinea 25620 5.04
63 Gabon 13740 4.18
83 Israel 27110 2.92
88 Kazakhstan 11250 2.52
131 Panama 14510 2.52
150 Saudi Arabia 24700 2.76
plot()
觀察兩變數之間的關係
# Shrink text and set the margins (bottom, left, top, right) for this figure.
par(cex = 0.8, mar = c(4, 4, 2, 1))
# Scatter plot: GNI on the x-axis, fertility rate on the y-axis.
plot(x = WHO$GNI, y = WHO$FertilityRate)
# Red reference lines at GNI = 10000 (vertical) and FertilityRate = 2.5
# (horizontal).
abline(v = 10000, col = "red")
abline(h = 2.5, col = "red")
table()
table(WHO$Region)
Africa Americas Eastern Mediterranean Europe
46 35 22 53
South-East Asia Western Pacific
11 27
table()
觀察『類別』的頻率
# Large bottom margin so the rotated region names fit below the bars.
par(mar = c(12, 3, 4, 2), cex = 0.8)
# Bar chart of the number of countries in each region; las = 2 draws the axis
# labels perpendicular to the axis.
barplot(table(WHO$Region), las = 2, main = "No. Country in Each Region")
hist()
觀察『數量』的分布
hist(WHO$CellularSubscribers)
boxplot()
比較各『類別』的『數量』分布
# Box plot of life expectancy for each region, using the default labels.
boxplot(WHO$LifeExpectancy ~ WHO$Region)
# Same comparison with a title, a readable y-axis label and no x-axis label.
boxplot(
  LifeExpectancy ~ Region,
  data = WHO,
  main = "Life Expectancy of Countries by Region",
  ylab = "Life Expectancy",
  xlab = ""
)
tapply()
tapply(WHO$Over60, WHO$Region, mean)
Africa Americas Eastern Mediterranean Europe
5.221 10.944 5.620 19.775
South-East Asia Western Pacific
8.769 10.163
tapply(WHO$LiteracyRate, WHO$Region, min)
Africa Americas Eastern Mediterranean Europe
NA NA NA NA
South-East Asia Western Pacific
NA NA
tapply(WHO$LiteracyRate, WHO$Region, min, na.rm=TRUE)
Africa Americas Eastern Mediterranean Europe
31.1 75.2 63.9 95.2
South-East Asia Western Pacific
56.8 60.6
# Load the USDA data set from the project's data folder.
USDA <- read.csv(file = "data/USDA.csv")
# Column-by-column overview: quartiles and mean for numeric columns, plus the
# number of missing values in each.
summary(USDA)
ID Description Calories Protein TotalFat
Min. : 1001 Length:7058 Min. : 0 Min. : 0.00 Min. : 0.00
1st Qu.: 8387 Class :character 1st Qu.: 85 1st Qu.: 2.29 1st Qu.: 0.72
Median :13294 Mode :character Median :181 Median : 8.20 Median : 4.37
Mean :14260 Mean :220 Mean :11.71 Mean : 10.32
3rd Qu.:18337 3rd Qu.:331 3rd Qu.:20.43 3rd Qu.: 12.70
Max. :93600 Max. :902 Max. :88.32 Max. :100.00
NA's :1 NA's :1 NA's :1
Carbohydrate Sodium SaturatedFat Cholesterol Sugar
Min. : 0.00 Min. : 0 Min. : 0.00 Min. : 0.0 Min. : 0.0
1st Qu.: 0.00 1st Qu.: 37 1st Qu.: 0.17 1st Qu.: 0.0 1st Qu.: 0.0
Median : 7.13 Median : 79 Median : 1.26 Median : 3.0 Median : 1.4
Mean : 20.70 Mean : 322 Mean : 3.45 Mean : 41.6 Mean : 8.3
3rd Qu.: 28.17 3rd Qu.: 386 3rd Qu.: 4.03 3rd Qu.: 69.0 3rd Qu.: 7.9
Max. :100.00 Max. :38758 Max. :95.60 Max. :3100.0 Max. :99.8
NA's :1 NA's :84 NA's :301 NA's :288 NA's :1910
Calcium Iron Potassium VitaminC VitaminE
Min. : 0 Min. : 0.00 Min. : 0 Min. : 0.0 Min. : 0.0
1st Qu.: 9 1st Qu.: 0.52 1st Qu.: 135 1st Qu.: 0.0 1st Qu.: 0.1
Median : 19 Median : 1.33 Median : 250 Median : 0.0 Median : 0.3
Mean : 74 Mean : 2.83 Mean : 301 Mean : 9.4 Mean : 1.5
3rd Qu.: 56 3rd Qu.: 2.62 3rd Qu.: 348 3rd Qu.: 3.1 3rd Qu.: 0.7
Max. :7364 Max. :123.60 Max. :16500 Max. :2400.0 Max. :149.4
NA's :136 NA's :123 NA's :409 NA's :332 NA's :2720
VitaminD
Min. : 0.0
1st Qu.: 0.0
Median : 0.0
Mean : 0.6
3rd Qu.: 0.1
Max. :250.0
NA's :2834
鹽分含量: 當資料很大的時候 head(), range(), summary(), hist()
nrow(USDA)
[1] 7058
head(USDA$Sodium, 300)
[1] 714 827 2 1395 560 629 842 690 621 700 604 364 344 330
[15] 330 406 321 965 1116 800 600 819 336 800 536 627 415 619
[29] 652 628 334 1529 1602 534 876 84 125 1200 1809 192 753 1671
[43] 1428 1370 966 1284 1552 1625 41 40 34 38 8 89 80 54
[57] 48 57 57 79 79 181 122 66 62 25 102 78 43 49
[71] 47 52 59 44 52 58 42 53 59 105 3 371 535 549
[85] 2280 517 127 106 115 60 66 61 44 50 17 52 44 111
[99] 95 48 968 54 1079 46 70 77 66 53 58 65 142 166
[113] 48 67 67 207 124 155 297 145 523 548 1156 1238 135 146
[127] 138 141 151 800 11 1696 42 59 106 535 549 346 515 368
[141] 3780 685 132 493 1131 655 617 612 21 133 158 1280 47 44
[155] 70 71 141 725 59 1150 702 135 1638 1705 1298 1499 1586 59
[169] 72 52 45 55 45 100 72 66 58 1529 80 196 876 776
[183] 576 43 371 106 1587 53 58 58 58 66 59 135 329 250
[197] 129 199 1808 751 704 105 66 81 61 68 129 162 94 146
[211] 164 110 92 92 696 1272 1671 1284 3663 77 16 76 23 17
[225] 18 160 83 1640 10 277 211 35 168 52 20 208 88 67
[239] 60 27 80 77 13 16 73 25 68 452 20 30 5 26
[253] 27 52 50 148 11 24 62 55 38 4 61 1135 38758 5
[267] 9 9 4 3 2 2964 314 26 31 30 344 8 23 8068
[281] 14 227 260 52 173 36 36 25 0 0 633 750 788 1133
[295] 1000 863 711 838 1074 868
# Smallest and largest sodium values, ignoring missing entries.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
range(USDA$Sodium, na.rm = TRUE)
[1] 0 38758
summary(USDA$Sodium)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0 37 79 322 386 38758 84
# Raw sodium values are heavily right-skewed (median 79, maximum 38758), so
# almost everything lands in the first bar.
hist(USDA$Sodium)
# A log scale spreads the distribution out. log1p(x) = log(1 + x) keeps foods
# with 0 sodium in the figure; plain log() would turn them into -Inf, which
# hist() discards without warning.
hist(log1p(USDA$Sodium))
鹽分含量最高的食物
# Finding the index of the food with highest sodium levels
which.max(USDA$Sodium)
[1] 265
# Get the name of the food with highest sodium levels
USDA$Description[265]
[1] "SALT,TABLE"
高鹽食物
# Keep only the foods whose sodium content is above 10,000mg; foods with a
# missing sodium value are excluded
HighSodium <- USDA[which(USDA$Sodium > 10000), ]
# Number of foods (rows) in that subset
nrow(HighSodium)
[1] 10
# Output names of the foods with high sodium content
HighSodium$Description
[1] "SALT,TABLE"
[2] "SOUP,BF BROTH OR BOUILLON,PDR,DRY"
[3] "SOUP,BEEF BROTH,CUBED,DRY"
[4] "SOUP,CHICK BROTH OR BOUILLON,DRY"
[5] "SOUP,CHICK BROTH CUBES,DRY"
[6] "GRAVY,AU JUS,DRY"
[7] "ADOBO FRESCO"
[8] "LEAVENING AGENTS,BAKING PDR,DOUBLE-ACTING,NA AL SULFATE"
[9] "LEAVENING AGENTS,BAKING SODA"
[10] "DESSERTS,RENNIN,TABLETS,UNSWTND"
魚子醬 match()
# Finding the index of CAVIAR in the dataset
match("CAVIAR", USDA$Description)
[1] 4154
# Find amount of sodium in caviar
USDA$Sodium[4154]
[1] 1500
# Doing it in one command!
USDA$Sodium[match("CAVIAR", USDA$Description)]
[1] 1500
USDA$Sodium[ USDA$Description == "CAVIAR" ]
[1] 1500
統計量
# Summary function over Sodium vector
summary(USDA$Sodium)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0 37 79 322 386 38758 84
# Standard deviation
sd(USDA$Sodium, na.rm = TRUE)
[1] 1045
點狀圖:蛋白質 vs. 脂肪
# Scatter plot of total fat (y) against protein (x) with default labels
plot(x = USDA$Protein, y = USDA$TotalFat)
# Same plot drawn in red, with a title and short axis labels
plot(
  x = USDA$Protein,
  y = USDA$TotalFat,
  main = "Protein vs Fat",
  xlab = "Protein",
  ylab = "Fat",
  col = "red"
)
直方圖:維他命C
# Histogram of vitamin C over the full range of the data
hist(USDA$VitaminC, main = "Histogram of Vitamin C", xlab = "Vitamin C (mg)")
# Zoom the x-axis to 0-100; the bins themselves are unchanged
hist(
  USDA$VitaminC,
  main = "Histogram of Vitamin C",
  xlab = "Vitamin C (mg)",
  xlim = c(0, 100)
)
# Ask for about 100 bins. Bins are laid out over the whole data range
# (maximum 2400), so only a few of them fall inside the 0-100 window
hist(
  USDA$VitaminC,
  main = "Histogram of Vitamin C",
  xlab = "Vitamin C (mg)",
  xlim = c(0, 100),
  breaks = 100
)
# Ask for about 2000 bins to get fine detail inside the 0-100 window
hist(
  USDA$VitaminC,
  main = "Histogram of Vitamin C",
  xlab = "Vitamin C (mg)",
  xlim = c(0, 100),
  breaks = 2000
)
盒狀圖:糖分
# Boxplots
boxplot(USDA$Sugar, ylab = "Sugar (g)", main = "Boxplot of Sugar")
# Indicator vector: 1 when a food's sodium is above the overall mean,
# 0 otherwise, NA when sodium is missing.
# Note: this reuses the name HighSodium, replacing the data frame of
# very-high-sodium foods created earlier in the notebook.
HighSodium <- as.numeric(USDA$Sodium > mean(USDA$Sodium, na.rm = TRUE))
# Store the same indicator as a column of the data set
USDA$HighSodium <- HighSodium
# Same above-the-mean indicators for HighCarbs, HighProtein and HighFat
USDA$HighCarbs <- with(
  USDA,
  as.numeric(Carbohydrate > mean(Carbohydrate, na.rm = TRUE))
)
USDA$HighProtein <- with(
  USDA,
  as.numeric(Protein > mean(Protein, na.rm = TRUE))
)
USDA$HighFat <- with(
  USDA,
  as.numeric(TotalFat > mean(TotalFat, na.rm = TRUE))
)
🗿 練習:
以下的運算式分別代表什麼意思 …
■ table(USDA$HighSodium)
■ table(USDA$HighSodium, USDA$HighFat)
■ tapply(USDA$Iron, USDA$HighProtein, mean, na.rm=TRUE)
■ tapply(USDA$VitaminC, USDA$HighCarbs, max, na.rm=TRUE)
■ tapply(USDA$VitaminC, USDA$HighCarbs, summary, na.rm=TRUE)
table(USDA$HighSodium)
0 1
4884 2090
table(USDA$HighSodium, USDA$HighFat)
0 1
0 3529 1355
1 1378 712
tapply(USDA$Iron, USDA$HighProtein, mean, na.rm=TRUE)
0 1
2.559 3.197
tapply(USDA$VitaminC, USDA$HighCarbs, max, na.rm=TRUE)
0 1
1678 2400
tapply(USDA$VitaminC, USDA$HighCarbs, summary, na.rm=TRUE)
$`0`
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 0.0 0.0 6.4 2.8 1677.6 248
$`1`
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 0.0 0.2 16.3 4.5 2400.0 83
📝 UNIT2B 學習重點:
■ 資料檢視功能:
§ 資料結構:str()
§ 資料總覽:summary()
§ 資料顯示:head(), tail()
■ 基本統計量(Statistics)函數:
§ mean(), median(), sd(), max(), min()
■ 基本繪圖功能:
§ 類別變數的分布:直條圖 barplot()
§ 數量變數的分布:直方圖 hist()
§ 兩數量變數的關係:點狀圖 plot()
§ 比較各類別的數量分布:盒狀圖 boxplot()
■ 位置函數:
§ which(), which.max(), which.min()
■ 最重要的兩個功能:
§ 計算各分類的數量 table()
§ 計算各分類的統計量 tapply()