基本的程式筆記設定,安裝、載入一些基本的套件

# Notebook setup: clear the global workspace, configure knitr and numeric
# printing options, and make sure the packages used below are available.
rm(list = ls(all.names = TRUE))
knitr::opts_chunk$set(comment = NA)
knitr::opts_knit$set(global.par = TRUE)
options(scipen = 20, digits = 4, width = 90)
# requireNamespace() only checks that pacman is installed; it does not need to
# be attached because it is always called through `pacman::`.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(magrittr)
以上這些程式碼請大家不要去改動

【CASE 1】世界衛生組織資料集 WHO Dataset

載入資料 Loading Data read.csv()
WHO = read.csv("data/WHO.csv")
資料結構 Data Structure str()
str(WHO)
'data.frame':   194 obs. of  13 variables:
 $ Country                      : chr  "Afghanistan" "Albania" "Algeria" "Andorra" ...
 $ Region                       : chr  "Eastern Mediterranean" "Europe" "Africa" "Europe" ...
 $ Population                   : int  29825 3162 38482 78 20821 89 41087 2969 23050 8464 ...
 $ Under15                      : num  47.4 21.3 27.4 15.2 47.6 ...
 $ Over60                       : num  3.82 14.93 7.17 22.86 3.84 ...
 $ FertilityRate                : num  5.4 1.75 2.83 NA 6.1 2.12 2.2 1.74 1.89 1.44 ...
 $ LifeExpectancy               : int  60 74 73 82 51 75 76 71 82 81 ...
 $ ChildMortality               : num  98.5 16.7 20 3.2 163.5 ...
 $ CellularSubscribers          : num  54.3 96.4 99 75.5 48.4 ...
 $ LiteracyRate                 : num  NA NA NA NA 70.1 99 97.8 99.6 NA NA ...
 $ GNI                          : num  1140 8820 8310 NA 5230 ...
 $ PrimarySchoolEnrollmentMale  : num  NA NA 98.2 78.4 93.1 91.1 NA NA 96.9 NA ...
 $ PrimarySchoolEnrollmentFemale: num  NA NA 96.4 79.4 78.2 84.5 NA NA 97.5 NA ...
資料顯示 Data Display head(), tail()
head(WHO)
              Country                Region Population Under15 Over60 FertilityRate
1         Afghanistan Eastern Mediterranean      29825   47.42   3.82          5.40
2             Albania                Europe       3162   21.33  14.93          1.75
3             Algeria                Africa      38482   27.42   7.17          2.83
4             Andorra                Europe         78   15.20  22.86            NA
5              Angola                Africa      20821   47.58   3.84          6.10
6 Antigua and Barbuda              Americas         89   25.96  12.35          2.12
  LifeExpectancy ChildMortality CellularSubscribers LiteracyRate   GNI
1             60           98.5               54.26           NA  1140
2             74           16.7               96.39           NA  8820
3             73           20.0               98.99           NA  8310
4             82            3.2               75.49           NA    NA
5             51          163.5               48.38         70.1  5230
6             75            9.9              196.41         99.0 17900
  PrimarySchoolEnrollmentMale PrimarySchoolEnrollmentFemale
1                          NA                            NA
2                          NA                            NA
3                        98.2                          96.4
4                        78.4                          79.4
5                        93.1                          78.2
6                        91.1                          84.5
資料總覽 Data Summary summary()
summary(WHO)
   Country             Region            Population         Under15         Over60     
 Length:194         Length:194         Min.   :      1   Min.   :13.1   Min.   : 0.81  
 Class :character   Class :character   1st Qu.:   1696   1st Qu.:18.7   1st Qu.: 5.20  
 Mode  :character   Mode  :character   Median :   7790   Median :28.6   Median : 8.53  
                                       Mean   :  36360   Mean   :28.7   Mean   :11.16  
                                       3rd Qu.:  24535   3rd Qu.:37.8   3rd Qu.:16.69  
                                       Max.   :1390000   Max.   :50.0   Max.   :31.92  
                                                                                       
 FertilityRate  LifeExpectancy ChildMortality   CellularSubscribers  LiteracyRate 
 Min.   :1.26   Min.   :47.0   Min.   :  2.20   Min.   :  2.57      Min.   :31.1  
 1st Qu.:1.83   1st Qu.:64.0   1st Qu.:  8.43   1st Qu.: 63.57      1st Qu.:71.6  
 Median :2.40   Median :72.5   Median : 18.60   Median : 97.75      Median :91.8  
 Mean   :2.94   Mean   :70.0   Mean   : 36.15   Mean   : 93.64      Mean   :83.7  
 3rd Qu.:3.90   3rd Qu.:76.0   3rd Qu.: 55.98   3rd Qu.:120.81      3rd Qu.:97.8  
 Max.   :7.58   Max.   :83.0   Max.   :181.60   Max.   :196.41      Max.   :99.8  
 NA's   :11                                     NA's   :10          NA's   :91    
      GNI        PrimarySchoolEnrollmentMale PrimarySchoolEnrollmentFemale
 Min.   :  340   Min.   : 37.2               Min.   : 32.5                
 1st Qu.: 2335   1st Qu.: 87.7               1st Qu.: 87.3                
 Median : 7870   Median : 94.7               Median : 95.1                
 Mean   :13321   Mean   : 90.8               Mean   : 89.6                
 3rd Qu.:17558   3rd Qu.: 98.1               3rd Qu.: 97.9                
 Max.   :86440   Max.   :100.0               Max.   :100.0                
 NA's   :32      NA's   :93                  NA's   :93                   


基本資料檢視 Basic Data Examination

15歲以下人口比率 mean(), sd()

mean(WHO$Under15)
[1] 28.73
sd(WHO$Under15)
[1] 10.53
summary(WHO$Under15)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   13.1    18.7    28.6    28.7    37.8    50.0 

15歲以下人口比率最低的國家 which(), which.min(), which.max()

which.min(WHO$Under15)
[1] 86
WHO$Country[86]
[1] "Japan"
WHO$Country[which.min(WHO$Under15)]
[1] "Japan"

15歲以下人口比率最高的國家

which.max(WHO$Under15)
[1] 124
WHO$Country[124]
[1] "Niger"
WHO$Country[which.max(WHO$Under15)]
[1] "Niger"


資料篩選 Subsetting subset(), nrow()
# Keep the countries with GNI above 10000 and fertility rate above 2.5.
# which() drops rows where the condition is NA, matching how subset()
# treats missing values.
Outliers <- WHO[which(WHO$GNI > 10000 & WHO$FertilityRate > 2.5), ]
# Number of countries selected
nrow(Outliers)
[1] 7
Outliers[c("Country","GNI","FertilityRate")]
              Country   GNI FertilityRate
23           Botswana 14550          2.71
56  Equatorial Guinea 25620          5.04
63              Gabon 13740          4.18
83             Israel 27110          2.92
88         Kazakhstan 11250          2.52
131            Panama 14510          2.52
150      Saudi Arabia 24700          2.76
點狀圖 Scatterplot plot()

觀察兩變數之間的關係

# Set text scaling and margins, then plot fertility rate against GNI
par(cex = 0.8, mar = c(4, 4, 2, 1))
plot(x = WHO$GNI, y = WHO$FertilityRate)
# Red reference lines at GNI = 10000 and FertilityRate = 2.5
abline(v = 10000, col = "red")
abline(h = 2.5, col = "red")


分類計數功能 table()
table(WHO$Region)

               Africa              Americas Eastern Mediterranean                Europe 
                   46                    35                    22                    53 
      South-East Asia       Western Pacific 
                   11                    27 


直條圖 Barplot barplot()

觀察『類別』的頻率

# Set margins (bottom = 12 lines) and text scaling, then draw the number of
# countries per region as a bar plot with las = 2 axis labels.
par(mar = c(12, 3, 4, 2), cex = 0.8)
barplot(table(WHO$Region), las = 2, main = "No. Country in Each Region")


直方圖 Histogram hist()

觀察『數量』的分布

hist(WHO$CellularSubscribers)

盒狀圖 Boxplot boxplot()

比較各『類別』的『數量』分布

boxplot(WHO$LifeExpectancy ~ WHO$Region)

# Life expectancy by region, with an explicit title and y-axis label and an
# empty x-axis label.
boxplot(
  LifeExpectancy ~ Region,
  data = WHO,
  xlab = "",
  ylab = "Life Expectancy",
  main = "Life Expectancy of Countries by Region"
)

分類統計功能 tapply()
tapply(WHO$Over60, WHO$Region, mean)
               Africa              Americas Eastern Mediterranean                Europe 
                5.221                10.944                 5.620                19.775 
      South-East Asia       Western Pacific 
                8.769                10.163 
tapply(WHO$LiteracyRate, WHO$Region, min)
               Africa              Americas Eastern Mediterranean                Europe 
                   NA                    NA                    NA                    NA 
      South-East Asia       Western Pacific 
                   NA                    NA 
tapply(WHO$LiteracyRate, WHO$Region, min, na.rm=TRUE)
               Africa              Americas Eastern Mediterranean                Europe 
                 31.1                  75.2                  63.9                  95.2 
      South-East Asia       Western Pacific 
                 56.8                  60.6 



【CASE 2】食物成分資料 USDA Dataset

載入、檢視資料 Video 2 - Reading in the Dataset
USDA = read.csv("data/USDA.csv")
summary(USDA)
       ID        Description           Calories      Protein         TotalFat     
 Min.   : 1001   Length:7058        Min.   :  0   Min.   : 0.00   Min.   :  0.00  
 1st Qu.: 8387   Class :character   1st Qu.: 85   1st Qu.: 2.29   1st Qu.:  0.72  
 Median :13294   Mode  :character   Median :181   Median : 8.20   Median :  4.37  
 Mean   :14260                      Mean   :220   Mean   :11.71   Mean   : 10.32  
 3rd Qu.:18337                      3rd Qu.:331   3rd Qu.:20.43   3rd Qu.: 12.70  
 Max.   :93600                      Max.   :902   Max.   :88.32   Max.   :100.00  
                                    NA's   :1     NA's   :1       NA's   :1       
  Carbohydrate        Sodium       SaturatedFat    Cholesterol         Sugar     
 Min.   :  0.00   Min.   :    0   Min.   : 0.00   Min.   :   0.0   Min.   : 0.0  
 1st Qu.:  0.00   1st Qu.:   37   1st Qu.: 0.17   1st Qu.:   0.0   1st Qu.: 0.0  
 Median :  7.13   Median :   79   Median : 1.26   Median :   3.0   Median : 1.4  
 Mean   : 20.70   Mean   :  322   Mean   : 3.45   Mean   :  41.6   Mean   : 8.3  
 3rd Qu.: 28.17   3rd Qu.:  386   3rd Qu.: 4.03   3rd Qu.:  69.0   3rd Qu.: 7.9  
 Max.   :100.00   Max.   :38758   Max.   :95.60   Max.   :3100.0   Max.   :99.8  
 NA's   :1        NA's   :84      NA's   :301     NA's   :288      NA's   :1910  
    Calcium          Iron          Potassium        VitaminC         VitaminE    
 Min.   :   0   Min.   :  0.00   Min.   :    0   Min.   :   0.0   Min.   :  0.0  
 1st Qu.:   9   1st Qu.:  0.52   1st Qu.:  135   1st Qu.:   0.0   1st Qu.:  0.1  
 Median :  19   Median :  1.33   Median :  250   Median :   0.0   Median :  0.3  
 Mean   :  74   Mean   :  2.83   Mean   :  301   Mean   :   9.4   Mean   :  1.5  
 3rd Qu.:  56   3rd Qu.:  2.62   3rd Qu.:  348   3rd Qu.:   3.1   3rd Qu.:  0.7  
 Max.   :7364   Max.   :123.60   Max.   :16500   Max.   :2400.0   Max.   :149.4  
 NA's   :136    NA's   :123      NA's   :409     NA's   :332      NA's   :2720   
    VitaminD    
 Min.   :  0.0  
 1st Qu.:  0.0  
 Median :  0.0  
 Mean   :  0.6  
 3rd Qu.:  0.1  
 Max.   :250.0  
 NA's   :2834   


簡單資料分析 Video 3 - Basic Data Analysis

鹽分含量: 當資料很大的時候 head(), range(), summary(), hist()

nrow(USDA)
[1] 7058
head(USDA$Sodium, 300)
  [1]   714   827     2  1395   560   629   842   690   621   700   604   364   344   330
 [15]   330   406   321   965  1116   800   600   819   336   800   536   627   415   619
 [29]   652   628   334  1529  1602   534   876    84   125  1200  1809   192   753  1671
 [43]  1428  1370   966  1284  1552  1625    41    40    34    38     8    89    80    54
 [57]    48    57    57    79    79   181   122    66    62    25   102    78    43    49
 [71]    47    52    59    44    52    58    42    53    59   105     3   371   535   549
 [85]  2280   517   127   106   115    60    66    61    44    50    17    52    44   111
 [99]    95    48   968    54  1079    46    70    77    66    53    58    65   142   166
[113]    48    67    67   207   124   155   297   145   523   548  1156  1238   135   146
[127]   138   141   151   800    11  1696    42    59   106   535   549   346   515   368
[141]  3780   685   132   493  1131   655   617   612    21   133   158  1280    47    44
[155]    70    71   141   725    59  1150   702   135  1638  1705  1298  1499  1586    59
[169]    72    52    45    55    45   100    72    66    58  1529    80   196   876   776
[183]   576    43   371   106  1587    53    58    58    58    66    59   135   329   250
[197]   129   199  1808   751   704   105    66    81    61    68   129   162    94   146
[211]   164   110    92    92   696  1272  1671  1284  3663    77    16    76    23    17
[225]    18   160    83  1640    10   277   211    35   168    52    20   208    88    67
[239]    60    27    80    77    13    16    73    25    68   452    20    30     5    26
[253]    27    52    50   148    11    24    62    55    38     4    61  1135 38758     5
[267]     9     9     4     3     2  2964   314    26    31    30   344     8    23  8068
[281]    14   227   260    52   173    36    36    25     0     0   633   750   788  1133
[295]  1000   863   711   838  1074   868
# Smallest and largest sodium values, ignoring missing entries.
# TRUE is a reserved word; T is an ordinary variable that could be reassigned.
range(USDA$Sodium, na.rm = TRUE)
[1]     0 38758
summary(USDA$Sodium)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
      0      37      79     322     386   38758      84 
hist(USDA$Sodium)

hist(log(USDA$Sodium))

鹽分含量最高的食物

# Finding the index of the food with highest sodium levels
which.max(USDA$Sodium)
[1] 265
# Get the name of the food with highest sodium levels
USDA$Description[265]
[1] "SALT,TABLE"

高鹽食物

# Foods whose sodium content exceeds 10,000 mg; which() skips rows where
# Sodium is NA, just as subset() does.
HighSodium <- USDA[which(USDA$Sodium > 10000), ]
# How many foods (rows) meet the threshold
nrow(HighSodium)
[1] 10
# Output names of the foods with high sodium content
HighSodium$Description
 [1] "SALT,TABLE"                                             
 [2] "SOUP,BF BROTH OR BOUILLON,PDR,DRY"                      
 [3] "SOUP,BEEF BROTH,CUBED,DRY"                              
 [4] "SOUP,CHICK BROTH OR BOUILLON,DRY"                       
 [5] "SOUP,CHICK BROTH CUBES,DRY"                             
 [6] "GRAVY,AU JUS,DRY"                                       
 [7] "ADOBO FRESCO"                                           
 [8] "LEAVENING AGENTS,BAKING PDR,DOUBLE-ACTING,NA AL SULFATE"
 [9] "LEAVENING AGENTS,BAKING SODA"                           
[10] "DESSERTS,RENNIN,TABLETS,UNSWTND"                        

魚子醬 match()

# Finding the index of CAVIAR in the dataset
match("CAVIAR", USDA$Description)
[1] 4154
# Find amount of sodium in caviar
USDA$Sodium[4154]
[1] 1500
# Doing it in one command!
USDA$Sodium[match("CAVIAR", USDA$Description)]
[1] 1500
USDA$Sodium[ USDA$Description == "CAVIAR" ] 
[1] 1500

統計量

  # Summary function over Sodium vector
  summary(USDA$Sodium)
     Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
        0      37      79     322     386   38758      84 
  # Standard deviation
  sd(USDA$Sodium, na.rm = TRUE)
  [1] 1045


基本繪圖 Video 4 - Plots

點狀圖:蛋白質 vs. 脂肪

# Basic scatter plot of total fat against protein
plot(x = USDA$Protein, y = USDA$TotalFat)

# Same plot with axis labels, a title and red points
plot(
  x = USDA$Protein, y = USDA$TotalFat,
  xlab = "Protein", ylab = "Fat",
  main = "Protein vs Fat", col = "red"
)

直方圖:維他命C

# Histogram of vitamin C with default bins and full x range
hist(
  USDA$VitaminC,
  xlab = "Vitamin C (mg)",
  main = "Histogram of Vitamin C"
)

# Restrict the displayed x-axis to 0-100 mg
hist(
  USDA$VitaminC,
  xlab = "Vitamin C (mg)",
  main = "Histogram of Vitamin C",
  xlim = c(0, 100)
)

# Request about 100 bins over the whole data range
hist(
  USDA$VitaminC,
  xlab = "Vitamin C (mg)",
  main = "Histogram of Vitamin C",
  xlim = c(0, 100),
  breaks = 100
)

# Request about 2000 bins for finer resolution inside the 0-100 mg window
hist(
  USDA$VitaminC,
  xlab = "Vitamin C (mg)",
  main = "Histogram of Vitamin C",
  xlim = c(0, 100),
  breaks = 2000
)

盒狀圖:糖分

# Boxplots
boxplot(USDA$Sugar, ylab = "Sugar (g)", main = "Boxplot of Sugar")


定義新欄位 Video 5 - Adding a variable
# Indicator vector: 1 if a food's sodium is above the mean sodium level,
# 0 otherwise (NA where sodium is missing).
HighSodium <- as.numeric(USDA[["Sodium"]] > mean(USDA[["Sodium"]], na.rm = TRUE))

# Store the same indicator as a column of the dataset
USDA[["HighSodium"]] <- as.numeric(
  USDA[["Sodium"]] > mean(USDA[["Sodium"]], na.rm = TRUE)
)

# Likewise for HighCarbs, HighProtein and HighFat
USDA[["HighCarbs"]] <- as.numeric(
  USDA[["Carbohydrate"]] > mean(USDA[["Carbohydrate"]], na.rm = TRUE)
)

USDA[["HighProtein"]] <- as.numeric(
  USDA[["Protein"]] > mean(USDA[["Protein"]], na.rm = TRUE)
)

USDA[["HighFat"]] <- as.numeric(
  USDA[["TotalFat"]] > mean(USDA[["TotalFat"]], na.rm = TRUE)
)


分類計數與分類統計量 Video 6 - Summary Tables

🗿 練習:
以下的運算式分別代表什麼意思 …
  ■ table(USDA$HighSodium)
  ■ table(USDA$HighSodium, USDA$HighFat)
  ■ tapply(USDA$Iron, USDA$HighProtein, mean, na.rm=TRUE)
  ■ tapply(USDA$VitaminC, USDA$HighCarbs, max, na.rm=TRUE)
  ■ tapply(USDA$VitaminC, USDA$HighCarbs, summary, na.rm=TRUE)

table(USDA$HighSodium)

   0    1 
4884 2090 
table(USDA$HighSodium, USDA$HighFat)
   
       0    1
  0 3529 1355
  1 1378  712
tapply(USDA$Iron, USDA$HighProtein, mean, na.rm=TRUE)
    0     1 
2.559 3.197 
tapply(USDA$VitaminC, USDA$HighCarbs, max, na.rm=TRUE)
   0    1 
1678 2400 
tapply(USDA$VitaminC, USDA$HighCarbs, summary, na.rm=TRUE)
$`0`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0     0.0     0.0     6.4     2.8  1677.6     248 

$`1`
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0     0.0     0.2    16.3     4.5  2400.0      83 




📝 UNIT2B 學習重點:
  ■ 資料檢視功能:
    § 資料結構:str()
    § 資料總覽:summary()
    § 資料顯示:head(), tail()
  ■ 基本統計量(Statistics)函數:
    § mean(), median(), sd(), max(), min()
  ■ 基本繪圖功能:
    § 類別變數的分布:直條圖 barplot()
    § 數量變數的分布:直方圖 hist()
    § 兩數量變數的關係:點狀圖 plot()
    § 比較各類別的數量分布:盒狀圖 boxplot()
  ■ 位置函數:
    § which(), which.max(), which.min()
  ■ 最重要的兩個功能:
    § 計算各分類的數量 table()
    § 計算各分類的統計量 tapply()