R語言基本介紹

基本運算

**R 就像一台計算機**

# Print a string to the console.
print("Hello R")
## [1] "Hello R"
# Addition.
8 + 7
## [1] 15
# Exponentiation.
6^5
## [1] 7776
# Division.
8 / 2
## [1] 4
# Square root.
sqrt(25)
## [1] 5
# Logarithm of 100 in base 10.
log(100, base = 10)
## [1] 2
# Sum of the elementwise products of two vectors.
sum(c(100, 200, 500) * c(-0.1, 0.2, 0.3))
## [1] 180
# Remainder of integer division (modulo).
5 %% 2
## [1] 1

物件式的運算環境

**R 裡面所有的東西都是一個物件 (object)**

# Probability of each outcome.
prob <- c(0.2, 0.5, 0.3)
# Payoff for each outcome, in the same order as `prob`.
payoff <- c(500, 100, 400)
# Expected payoff: sum of the elementwise products of the two vectors.
expected_payoff <- sum(prob * payoff)


邏輯與比較運算子:`&` (and)、`|` (or)、`!` (not)、`==` (等於)、`!=` (不等於)

基本向量運算

# Example numeric vector used by the vector, indexing and statistics snippets.
x <- c(1, 20, 30, 55, 66)
length(x) # number of elements in the vector
## [1] 5
sum(x) # sum of all elements
## [1] 172
prod(x) # product of all elements
## [1] 2178000
cumsum(x) # cumulative sums
## [1]   1  21  51 106 172
cumprod(x) # cumulative products
## [1]       1      20     600   33000 2178000
sort(x) # values sorted in increasing order
## [1]  1 20 30 55 66
rank(x) # rank of each element (its position in sorted order), not a sorted copy
## [1] 1 2 3 4 5

向量的指標用法

# Second element of x.
x[2]
## [1] 20
# Index 6 is past the last element of the 5-element vector, so the result is NA.
x[6]
## [1] NA
# A vector of indices selects several elements at once.
x[1:2]
## [1]  1 20

基本統計用法

mean(x) # arithmetic mean
## [1] 34.4
max(x) # largest value
## [1] 66
min(x) # smallest value
## [1] 1
var(x) # sample variance
## [1] 691.3
median(x) # median
## [1] 30
sum(x) # sum of all elements
## [1] 172
sd(x) # sample standard deviation (square root of var(x))
## [1] 26.29258


不用sd function 要如何計算標準差呢?

#標準差
n = length(x)
x.sd = sqrt((sum(x^2)-n*mean(x)^2)/(n-1)) #26.29

奧運資料接觸

安裝及讀取需要的套件

library(readr)

讀取資料檔

# Load the athlete CSV file into `athlete`.
athlete <- read_csv(file = "asset/athlete_all.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_integer(),
##   country = col_character(),
##   year = col_integer(),
##   ID = col_integer(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_integer(),
##   Height = col_integer(),
##   NOC = col_character(),
##   Games = col_character(),
##   Season = col_character(),
##   City = col_character(),
##   Sport = col_character(),
##   Event = col_character(),
##   Medal = col_character(),
##   food_suppiy = col_integer(),
##   income_GDP = col_integer(),
##   democracy_score = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not a
## multiple of vector length (arg 1)
## Warning: 42 parsing failures.
## row # A tibble: 5 x 5 col     row col           expected               actual file                    expected   <int> <chr>         <chr>                  <chr>  <chr>                   actual 1  1166 female_school no trailing characters r      'asset/athlete_all.csv' file 2  1167 female_school no trailing characters r      'asset/athlete_all.csv' row 3  1168 female_school no trailing characters r      'asset/athlete_all.csv' col 4  1169 female_school no trailing characters r      'asset/athlete_all.csv' expected 5  1170 female_school no trailing characters r      'asset/athlete_all.csv'
## ... ................. ... ........................................................................... ........ ........................................................................... ...... ........................................................................... .... ........................................................................... ... ........................................................................... ... ........................................................................... ........ ...........................................................................
## See problems(...) for more details.
# Drop the first column (the unnamed one that read_csv filled in as 'X1').
athlete <- athlete[, -1]

資料摘要

# Per-column summary of the athlete data.
summary(athlete)
##    country               year            ID             Name          
##  Length:271116      Min.   :1896   Min.   :     1   Length:271116     
##  Class :character   1st Qu.:1960   1st Qu.: 34643   Class :character  
##  Mode  :character   Median :1988   Median : 68205   Mode  :character  
##                     Mean   :1978   Mean   : 68249                     
##                     3rd Qu.:2002   3rd Qu.:102097                     
##                     Max.   :2016   Max.   :135571                     
##                                                                       
##      Sex                 Age            Height          Weight     
##  Length:271116      Min.   :10.00   Min.   :127.0   Min.   : 25.0  
##  Class :character   1st Qu.:21.00   1st Qu.:168.0   1st Qu.: 60.0  
##  Mode  :character   Median :24.00   Median :175.0   Median : 70.0  
##                     Mean   :25.56   Mean   :175.3   Mean   : 70.7  
##                     3rd Qu.:28.00   3rd Qu.:183.0   3rd Qu.: 79.0  
##                     Max.   :97.00   Max.   :226.0   Max.   :214.0  
##                     NA's   :9474    NA's   :60171   NA's   :62875  
##      NOC               Games              Season              City          
##  Length:271116      Length:271116      Length:271116      Length:271116     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Sport              Event              Medal              ce_rate      
##  Length:271116      Length:271116      Length:271116      Min.   : 17.00  
##  Class :character   Class :character   Class :character   1st Qu.: 47.60  
##  Mode  :character   Mode  :character   Mode  :character   Median : 51.70  
##                                                           Mean   : 55.43  
##                                                           3rd Qu.: 58.20  
##                                                           Max.   :118.00  
##                                                           NA's   :83417   
##   electricity        female_school     food_suppiy     health_expense  
##  Min.   :3.500e+09   Min.   : 15.40   Min.   :1440     Min.   : 1.69   
##  1st Qu.:7.350e+10   1st Qu.: 96.50   1st Qu.:2890     1st Qu.: 5.71   
##  Median :2.260e+11   Median :100.00   Median :3130     Median : 7.76   
##  Mean   :6.796e+11   Mean   : 96.84   Mean   :3090     Mean   : 7.88   
##  3rd Qu.:6.020e+11   3rd Qu.:103.00   3rd Qu.:3390     3rd Qu.: 9.49   
##  Max.   :6.140e+12   Max.   :130.00   Max.   :3810     Max.   :20.60   
##  NA's   :166832      NA's   :131069   NA's   :122790   NA's   :206269  
##    income_GDP        baby_pw          co2_em      child_mortality 
##  Min.   :   263   Min.   :1.12    Min.   : 0.01   Min.   :  2.10  
##  1st Qu.:  7390   1st Qu.:1.61    1st Qu.: 3.04   1st Qu.:  7.60  
##  Median : 14800   Median :2.04    Median : 6.48   Median : 19.60  
##  Mean   : 19366   Mean   :2.52    Mean   : 7.34   Mean   : 52.64  
##  3rd Qu.: 30100   3rd Qu.:2.87    3rd Qu.:10.00   3rd Qu.: 60.60  
##  Max.   :136000   Max.   :8.30    Max.   :61.80   Max.   :536.00  
##  NA's   :45550    NA's   :46166   NA's   :59753   NA's   :45763   
##    inflation       unemployment_15    bc_w_death     democracy_score 
##  Min.   : -17.10   Min.   : 0.14    Min.   : 0.53    Min.   :-10.00  
##  1st Qu.:   1.67   1st Qu.: 4.65    1st Qu.:15.50    1st Qu.:  3.00  
##  Median :   3.92   Median : 6.83    Median :20.20    Median : 10.00  
##  Mean   :  17.17   Mean   : 7.56    Mean   :19.48    Mean   :  5.43  
##  3rd Qu.:   8.46   3rd Qu.: 9.62    3rd Qu.:23.80    3rd Qu.: 10.00  
##  Max.   :6040.00   Max.   :35.50    Max.   :46.60    Max.   : 10.00  
##  NA's   :115730    NA's   :154684   NA's   :103013   NA's   :79678
# First 10 rows of the country-by-medal contingency table.
head(table(athlete$country, athlete$Medal), n = 10)
##                        
##                         Bronze Gold Silver
##   30. Februar                0    0      0
##   A North American Team      4    0      0
##   Acipactli                  0    0      0
##   Acturus                    0    0      0
##   Afghanistan                2    0      0
##   Akatonbo                   0    0      0
##   Alain IV                   0    0      0
##   Albania                    0    0      0
##   Alcaid                     0    0      0
##   Alcyon-6                   0    0      0

確認資料結構

# Check whether `athlete` is a data frame.
is.data.frame(athlete)
## [1] TRUE

基本資料探索

nrow(athlete) # number of rows
## [1] 271116
ncol(athlete) # number of columns
## [1] 28
dim(athlete)    # dimensions: rows, then columns
## [1] 271116     28
class(athlete$year) # class of the year column
## [1] "integer"
head(athlete) # first rows of the data
## # A tibble: 6 x 28
##   country  year     ID Name  Sex     Age Height Weight NOC   Games Season City 
##   <chr>   <int>  <int> <chr> <chr> <int>  <int>  <dbl> <chr> <chr> <chr>  <chr>
## 1 30. Fe…  1952  34666 Hara… M        23    167     70 AUT   1952… Summer Hels…
## 2 30. Fe…  1952 127384 Hara… M        44    176     NA AUT   1952… Summer Hels…
## 3 A Nort…  1900  33106 Jos … M        38     NA     NA MEX   1900… Summer Paris
## 4 A Nort…  1900  33107 Jos … M        42     NA     NA MEX   1900… Summer Paris
## 5 A Nort…  1900  33109 Jos … M        44     NA     NA MEX   1900… Summer Paris
## 6 A Nort…  1900 131733 Will… M        NA     NA     NA USA   1900… Summer Paris
## # … with 16 more variables: Sport <chr>, Event <chr>, Medal <chr>,
## #   ce_rate <dbl>, electricity <dbl>, female_school <dbl>, food_suppiy <int>,
## #   health_expense <dbl>, income_GDP <int>, baby_pw <dbl>, co2_em <dbl>,
## #   child_mortality <dbl>, inflation <dbl>, unemployment_15 <dbl>,
## #   bc_w_death <dbl>, democracy_score <int>
names(athlete) # column (variable) names
##  [1] "country"         "year"            "ID"              "Name"           
##  [5] "Sex"             "Age"             "Height"          "Weight"         
##  [9] "NOC"             "Games"           "Season"          "City"           
## [13] "Sport"           "Event"           "Medal"           "ce_rate"        
## [17] "electricity"     "female_school"   "food_suppiy"     "health_expense" 
## [21] "income_GDP"      "baby_pw"         "co2_em"          "child_mortality"
## [25] "inflation"       "unemployment_15" "bc_w_death"      "democracy_score"

傳回資料框架變數

# Single-bracket selection by position: the result is a one-column tibble.
athlete[2]
## # A tibble: 271,116 x 1
##     year
##    <int>
##  1  1952
##  2  1952
##  3  1900
##  4  1900
##  5  1900
##  6  1900
##  7  1964
##  8  1964
##  9  1964
## 10  1948
## # … with 271,106 more rows
# Single-bracket selection by column name: the same one-column tibble.
athlete["year"]
## # A tibble: 271,116 x 1
##     year
##    <int>
##  1  1952
##  2  1952
##  3  1900
##  4  1900
##  5  1900
##  6  1900
##  7  1964
##  8  1964
##  9  1964
## 10  1948
## # … with 271,106 more rows

傳回向量

# `$` extracts the column as a plain vector.
head(athlete$year, 10)
##  [1] 1952 1952 1900 1900 1900 1900 1964 1964 1964 1948
# `[[` also extracts the column as a vector. `athlete[, 2]` would not:
# `athlete` is a tibble, and `[, j]` on a tibble returns a one-column tibble.
head(athlete[[2]], 10)
##  [1] 1952 1952 1900 1900 1900 1900 1964 1964 1964 1948

資料框合併

# Read the NOC-to-region lookup table.
region <- read_csv(file = "asset/noc_regions.csv")
## Parsed with column specification:
## cols(
##   NOC = col_character(),
##   region = col_character(),
##   notes = col_character()
## )
# Drop the third column (`notes`), keeping NOC and region.
region <- region[, -3]

以兩者共同的欄位 NOC 合併(保留 athlete 的所有資料列)

# Left join: keep every athlete row and attach the matching region by NOC.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
ar <- merge(athlete, region, by = "NOC", all.x = TRUE)

建立資料框架

# Column vectors for the first data frame.
weight <- c(65, 75, 80, 87)
height <- c(155, 165, 175, 185)
gender <- c("F", "F", "M", "M")
# Column vectors for the second data frame.
major <- c("chinese", "math", "BM", "finance")
age <- c(20, 27, 28, 29)
# data.frame() takes the column names from the variable names.
nsysu <- data.frame(weight, height, gender)
nsysu2 <- data.frame(major, age)

合併欄位

# Place the columns of nsysu2 to the right of the columns of nsysu.
nsysu <- cbind(nsysu, nsysu2)

合併列數

# New observations with the same five columns as nsysu.
weight <- c(85, 95, 100, 87)
height <- c(155, 155, 145, 185)
gender <- c("M", "F", "M", "M")
major <- c("chinese", "math", "BM", "finance")
age <- c(25, 25, 25, 24)
nsysu2 <- data.frame(weight, height, gender, major, age)
# Append the new rows below the existing rows.
nsysu <- rbind(nsysu, nsysu2)